In [1]:
# Display plots in the notebook
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [143]:
# Common imports
import numpy as np
import pylab as pl
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import scale

In [3]:
# Make plots prettyful
pl.style.use('fivethirtyeight')

# Question 1

In [146]:
data = pd.read_csv('original_with_duplicates_noid.csv')

In [152]:
values = scale(pd.get_dummies(data, columns=['year', 'day', 'month', 'suspect.race', 'suspect.build', 'suspect.sex', 'location.housing']).values)



In [153]:
db = DBSCAN(min_samples=1, eps=1)
labels = db.fit_predict(values)
print("Number of unique elements via clustering:", len(np.unique(labels)))

Number of unique elements via clustering: 997


In [154]:
print("Number of unique elements via row-wise detection:", len(data[~data.duplicated()]))

Number of unique elements via row-wise detection: 997


I first hashed the data to convert all the data to numeric types (instead of strings0. I used DBSCAN to cluster the data using all the columns in the dataset.  I used DBSCAN because it does not require the number of clusters to be specified.  

I used pandas's built in duplicate detection to validate this result.

# Question 2

In [159]:
data = pd.read_csv('original_with_errors_noid.csv')

In [197]:
km = KMeans(n_clusters=1000)
labels = km.fit_predict(values)

In [183]:
values = scale(data.applymap(lambda x: hash(x)).values)

# Find the optimal distance
for eps in np.arange(.1,5,.1):
    db = DBSCAN(min_samples=1, eps=eps)
    labels = db.fit_predict(values)

    # Assume that there should only be at most 2 points in a cluster.
    # This assumption may be wrong.
    # If there is a bin with 3 or more, stop looping and and revert to previous value
    if max(np.bincount(labels)) > 2:
        break

# Readjust eps
eps -= .1
print("Optimal eps:", eps)



Optimal eps: 1.3


In [184]:
# Rerun DBSCAN with optimal EPS
db = DBSCAN(min_samples=1, eps=eps)
labels = db.fit_predict(values)

In [185]:
print("Number of unique elements via clustering:", len(np.unique(labels)))

Number of unique elements via clustering: 997


In [198]:
# Figure out which features had errors introduced by diffing the rows in each cluster and picking out the non-zero features
error_columns = set()
for i in np.nonzero(np.bincount(labels) > 1)[0]:
    columns = data.columns[np.nonzero(np.diff(values[labels == i], axis=0)[0])]
    for c in columns:
        error_columns.add(c)
print("Features with errors:", error_columns)

Features with errors: {'suspect.weight', 'suspect.age'}


In [192]:
data[labels == 1]

Unnamed: 0,year,found.weapon,found.gun,arrested,suspect.race,suspect.age,suspect.build,suspect.sex,suspect.height,suspect.weight,...,additional.associating,additional.direction,additional.highcrime,additional.time,additional.sights,additional.other,radio.run,day,month,time.period
1,2012,False,False,False,black,24,medium,male,5.666667,155,...,False,False,True,False,False,False,False,Monday,August,6
939,2012,False,False,False,black,21,medium,male,5.666667,160,...,False,False,True,False,False,False,False,Monday,August,6


In this case, since the points won't be exactly in the same place, we're not sure how far apart we should expect them to be.  I assumed that there would only be one duplicate of any particular row (which may be a wrong assumption) and searched for the correct distance (eps) using that assumption. After doing that, I found the features that were altered by diffing the features in each cluster to find non-zero differences.  

In [79]:
np.arange(.1,2,.1)

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9])

In [130]:
data = pd.read_csv('cpw_stops_2012.csv')

In [136]:
values = scale(data.applymap(lambda x: hash(x)).values)



In [144]:
km = KMeans()
labels = km.fit_predict(values)

In [145]:
np.unique(labels)

array([0, 1, 2, 3, 4, 5, 6, 7])