# Assignment 2B

The following code gathers prostitution-related crime data published by the San Francisco Police Department. [Link to data](http://data.sfgov.org/api/views/tmnf-yvry/rows.csv?accessType=DOWNLOAD&api_foundry=true).

The data gathered:
* Coordinates of where prostitution crimes happened, and the cluster each one is assigned to for each value of k (2 to 6), using [k-means clustering analysis](https://en.wikipedia.org/wiki/K-means_clustering)
* Coordinates of cluster centers for different values of k (2 to 6)

In [1]:
import csv
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict
import json

In [2]:
# Load the SFPD incident export (records from 1 January 2003) into 'crimes',
# a dict that maps each incident number to the fields used further down.
# Every value is kept as the raw string read from the CSV.
# NOTE(review): rows are keyed by "IncidntNum", so a later row carrying the
# same incident number replaces the earlier one - confirm that incident
# numbers are unique in this file.
crimes = {}
with open('SFPD_Incidents_-_from_1_January_2003.csv') as csvfile:
    for row in csv.DictReader(csvfile, delimiter=","):
        record = dict(
            category=row["Category"],
            weekday=row["DayOfWeek"],
            date=row["Date"],
            time=row["Time"],
            district=row["PdDistrict"],
            latitude=row["Y"],    # the "Y" column holds the latitude
            longitude=row["X"],   # the "X" column holds the longitude
        )
        crimes[row["IncidntNum"]] = record

In [3]:
# Collect the coordinates of every 'PROSTITUTION' record in 'crimes' into two
# parallel lists: latitudes[i] and longitudes[i] describe the same incident.
latitudes = []
longitudes = []
for crime in crimes.values():
    # Test the category first: it is a cheap string comparison, and it keeps
    # coordinates of unrelated categories from being parsed at all.
    if crime["category"] != "PROSTITUTION":
        continue
    latitude = float(crime["latitude"])
    # Skip records whose latitude is wrongly placed at the North Pole (90).
    if latitude == 90:
        continue
    latitudes.append(latitude)
    longitudes.append(float(crime["longitude"]))

In [4]:
# Run k-means on the incident coordinates for every k from 2 to 6 and keep
# the per-point cluster labels. 'k_labels' is a list of lists: k_labels[0]
# holds the labels for k=2, ..., k_labels[4] the labels for k=6, each in the
# same order as 'latitudes' / 'longitudes'.
#
# The feature matrix does not depend on k, so it is built once, outside the
# loop. It is a plain 2-D ndarray with one [latitude, longitude] row per
# incident; np.matrix is avoided because newer scikit-learn releases reject
# it, and column_stack does not rely on zip() returning a list.
X = np.column_stack((latitudes, longitudes))
k_labels = []
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k).fit(X)
    k_labels.append(kmeans.labels_.tolist())

In [5]:
# Preview 'k_labels': the first 30 labels of the k=2 clustering (index 0)
# and of the k=6 clustering (index 4).
sample_size = 30
print("K2 labels (first 30 values):")
print(k_labels[0][:sample_size])
print("K6 labels (first 30 values):")
print(k_labels[4][:sample_size])

K2 labels (first 30 values):
[0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1]
K6 labels (first 30 values):
[4, 1, 2, 2, 5, 5, 2, 1, 2, 2, 5, 5, 1, 1, 1, 2, 5, 1, 2, 2, 1, 3, 2, 5, 1, 2, 2, 1, 1, 0]


In [6]:
# Export one CSV row per incident: its latitude, its longitude and the
# cluster label it received for k = 2, 3, 4, 5 and 6.
# The file is opened in binary mode ("wb"), which is what the Python 2 csv
# module expects for output files.
header = ("lat", "lon", "k2", "k3", "k4", "k5", "k6")
label_columns = [k_labels[position] for position in range(5)]
with open("./data_2B/k_means_lat_lon.csv", "wb") as csv_out:
    out = csv.writer(csv_out)
    out.writerow(header)
    out.writerows(zip(latitudes, longitudes, *label_columns))

In [7]:
# Build 'k_clusters': for every k from 2 to 6, the key "k<k>" maps to a list
# of {"lat": ..., "lon": ...} cluster centers, where list position j is the
# center of the cluster labelled j in k_labels[k - 2].
#
# The centers are computed as the mean coordinate of the points carrying each
# label instead of fitting KMeans a second time. K-means starts from a random
# initialisation, so an independent fit may number (or even shape) its
# clusters differently from the fit that produced 'k_labels', and the centers
# would then not line up with the exported labels.
k_clusters = defaultdict(list)
points = np.column_stack((latitudes, longitudes))
for k in range(2, 7):
    labels = np.asarray(k_labels[k - 2])
    for cluster in range(k):
        members = points[labels == cluster]
        if len(members) == 0:
            # A mean over no points would be NaN, which is not valid JSON.
            raise ValueError(
                "no points carry label %d for k=%d" % (cluster, k))
        center_lat, center_lon = members.mean(axis=0)
        # Plain Python floats keep the printed and JSON output free of
        # NumPy scalar types.
        k_clusters["k" + str(k)].append(
            {"lat": float(center_lat), "lon": float(center_lon)})

In [8]:
# Preview 'k_clusters': the cluster centers stored for k=2 and for k=6.
k2_centers = k_clusters["k2"]
k6_centers = k_clusters["k6"]
print("K2 cluster centers:")
print(k2_centers)
print("K6 cluster centers:")
print(k6_centers)

K2 cluster centers:
[{'lat': 37.787458880666151, 'lon': -122.41754940681371}, {'lat': 37.760368403407782, 'lon': -122.41897422113254}]
K6 cluster centers:
[{'lat': 37.7876967416654, 'lon': -122.41874244761991}, {'lat': 37.719644409717937, 'lon': -122.46783363226149}, {'lat': 37.761851155060633, 'lon': -122.41599255678452}, {'lat': 37.785387358602087, 'lon': -122.40500501448241}, {'lat': 37.75921173107831, 'lon': -122.48755520466297}, {'lat': 37.729802416763178, 'lon': -122.40478562671251}]


In [9]:
# Serialise the whole 'k_clusters' mapping (keys "k2" ... "k6") to a JSON
# file next to the CSV export.
centers_path = './data_2B/k_means_cluster_centers.json'
with open(centers_path, 'w') as json_out:
    json.dump(k_clusters, json_out)