## Prepping Data
- Query in IPUMS 
- Download R script, data extract
- Modify R script to add `write.csv(data, file = "census.csv")`

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import metrics
import time

In [None]:
# Load the census extract, then discard the 'Unnamed: 0' column
# (presumably the row-name column that R's write.csv emits -- confirm).
data = pd.read_csv('./data/census.csv')
data = data.drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the loaded extract.
data.head()

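For the neighborhood-type category, we need to calculate the tract-weighted average density for each PUMA. The cell below currently adds only the number of extract records per PUMA (`PUMA_TOTAL_POP`).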

In [None]:
# Count how many rows of the extract fall in each PUMA and attach that count
# to every row. Despite its name, puma_total_counts_df is a Series indexed by
# PUMA code, and the counts are plain row counts (no weighting is applied).
# NOTE(review): if PUMA codes repeat across states in this extract, rows from
# different states would be pooled together -- confirm.
puma_column = data['PUMA']
puma_total_counts_df = puma_column.value_counts()
data['PUMA_TOTAL_POP'] = puma_column.map(puma_total_counts_df)

Perform one-hot encoding on categorical data (sex, race/ethnicity, neighborhood type, marital status, education, employment status), extract relevant columns for clustering

In [None]:
# Replace each categorical column with one indicator column per distinct
# value; pandas names the new columns '<COLUMN>_<value>'.
categorical_columns = ['SEX', 'RACE', 'MARST', 'EMPSTAT']
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Show the column labels of the current frame.
data.columns

In [None]:
# Preview the first rows of the current frame.
data.head()

In [None]:
# Show the (rows, columns) dimensions of the current frame.
data.shape

Extract only relevant columns and normalize

In [None]:
# Keep only the columns used for clustering. DataFrame.filter(regex=...)
# keeps every column whose name contains a match for the pattern.
# RACE was one-hot encoded above, so its columns are named 'RACE_<value>';
# the previous 'RACE$' alternative could not match any of them and silently
# dropped race from the feature set. 'RACE_' selects the indicator columns
# while still not matching a column named e.g. 'RACED'.
# 'EDUC$' is left anchored because EDUC is not one-hot encoded.
relevant_data = data.filter(regex='AGE|RACE_|SEX|PUMA_TOTAL_POP|MARST|EDUC$|FTOTINC|EMPSTAT')

In [None]:
# Show which columns were selected for clustering.
relevant_data.columns

In [None]:
# Fit the scaler on the selected columns and standardize them in one step.
# The fitted scaler is kept so cluster centers can be mapped back to the
# original units later.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(relevant_data)

In [None]:
# Bootstrap settings: DBSCAN is fitted `runs` times, each time on a random
# subset holding `sample_ratio` of the rows of scaled_data.
runs = 2
sample_ratio = .1
dbscan_min_samples = 500
num_clusters_list = []  # one estimated cluster count per run

# Set to True to print the wall-clock seconds each run takes. The original
# author measured roughly 4500s per iteration (the note does not say on
# which setup).
time_flag = False

# The subset size does not change between runs, so work it out once.
num_samples = int(sample_ratio * scaled_data.shape[0])

if time_flag:
    t0 = time.time()

for _ in range(runs):
    # Rows are drawn with replacement, so one row can appear more than once.
    row_idx = np.random.choice(scaled_data.shape[0], num_samples, replace=True)
    samples = scaled_data[row_idx]

    db = DBSCAN(min_samples=dbscan_min_samples).fit(samples)
    labels = db.labels_

    # DBSCAN labels noise points -1; that label is not counted as a cluster.
    n_clusters_ = len(set(labels)) - int(-1 in labels)
    num_clusters_list.append(n_clusters_)

    if time_flag:
        t1 = time.time()
        delta = t1 - t0
        print(delta)
        t0 = t1

In [None]:
# Show the cluster count estimated in each bootstrap run.
num_clusters_list

In [None]:
# Fit k-means on the full scaled data set, using the cluster count that
# DBSCAN estimated in the first bootstrap run as k. The previous code read
# from 'samples_list', which is never defined; the per-run estimates are
# stored in num_clusters_list.
# KMeans needs k >= 1, so this raises if that run found no clusters.
kmeans = KMeans(n_clusters=num_clusters_list[0]).fit(scaled_data)
# Cluster centers, expressed in standardized feature units.
kmeans.cluster_centers_

Use DBSCAN to estimate # of clusters

In [None]:
# Fit DBSCAN on the most recently drawn sample, reusing the shared
# dbscan_min_samples setting instead of repeating the literal 500 so the two
# fits cannot drift apart.
db = DBSCAN(min_samples=dbscan_min_samples).fit(samples)

In [None]:
# Boolean mask that is True for every sampled point DBSCAN marked as a core
# sample.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present (noise is -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = int(np.sum(labels == -1))

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# The homogeneity / completeness / V-measure / adjusted Rand / adjusted
# mutual information scores that used to be printed here compare against
# ground-truth labels ('labels_true'). No such labels are defined anywhere in
# this notebook, so those calls raised NameError and have been removed.

In [None]:
# Fit k-means on the full scaled data set with the cluster count estimated
# by DBSCAN. The estimate is stored in n_clusters_ (trailing underscore);
# the previous 'n_clusters' name is never defined.
# KMeans needs k >= 1, so this raises if DBSCAN found no clusters.
kmeans = KMeans(n_clusters=n_clusters_).fit(scaled_data)

In [None]:
# Map the cluster centers from standardized units back to the original
# feature units. (Fixes the misspelled method name 'inverse_tranform'.)
scaler.inverse_transform(kmeans.cluster_centers_)

In [None]:
# Display the full frame.
data