In [1]:
from peewee import *
import numpy as np
import numpy.random as rnd
import pandas as pd
import requests
import math
from datetime import datetime
from Model import *
import re # Regular Expression

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
from sklearn import preprocessing
from scipy.cluster.vq import whiten

import dglim
from dglim import City

%matplotlib inline

# Load Datasets

In [2]:
# Pull the three project tables through the same loader, one after another
load_dataset = dglim.loadData

master_df = load_dataset('Master Dataset')   # one row per business
tracts_df = load_dataset('Tracts Dataset')
blocks_df = load_dataset('Blocks Dataset')

# Format Data

In [3]:
def makeSparse(column, df=None):
    """One-hot encode a categorical column into 0/1 indicator columns.

    Parameters
    ----------
    column : str
        Name of the categorical column in ``df``.
    df : pandas.DataFrame, optional
        Frame that holds ``column``. Defaults to the notebook-level
        ``master_df``, looked up when the function is called (not when it is
        defined) so a reloaded dataset is picked up.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with one integer column per distinct non-missing
        label, named ``"<column> - <label>"`` in order of first appearance.
        Rows whose value is missing are 0 in every indicator column.
    """
    if df is None:
        df = master_df

    column_data = df[column]
    sparse_df = pd.DataFrame(index=df.index)

    # Skip missing values: a NaN/None label cannot be concatenated into a
    # column name, and no row would ever compare equal to it anyway.
    for label in column_data.dropna().unique():
        sparse_df[column + " - " + label] = (column_data == label).astype(int)

    return sparse_df

In [4]:
# Every entry of feature_data is a DataFrame indexed like master_df (minus the
# rows where that feature is missing), ready to be joined into a clustering input.
feature_data = {}

# Positional data: keep only coordinates within 0.2 degrees of the city's
# reference point; each axis is filtered on its own.
feature_data['Latitude'] = pd.DataFrame(master_df['Latitude'][abs(master_df['Latitude'] - City.latitude) < .2]).dropna()
feature_data['Longitude'] = pd.DataFrame(master_df['Longitude'][abs(master_df['Longitude'] - City.longitude) < .2]).dropna()

# Categorical data, expanded into one 0/1 indicator column per label
feature_data['Business Type'] = makeSparse('Business Type')
feature_data['NAICS Type'] = makeSparse('NAICS Type')

# Dates: age in years. The start date is subtracted FROM the current time so
# that a business opened in the past gets a positive age.
feature_data['Age'] = pd.DataFrame((datetime.now() - master_df['Start Date']).rename('Age').apply(lambda x: x.days/365.25)).dropna()

# Successfulness: map the survey labels onto an ordinal 0-3 scale
# (labels not listed here become NaN and are dropped)
successLabelValues = {
    'Very Successful' : 3,
    'Somewhat Successful' : 2,
    'Somewhat Unsuccessful' : 1,
    'Very Unsuccessful' : 0
}
feature_data['Successfulness'] = pd.DataFrame(master_df['Successfulness'].map(successLabelValues)).dropna()

# Financials: profit = revenue - expenses, per year
feature_data.update({
    'Profit in ' + year: pd.DataFrame(
        (master_df['Revenue in ' + year] - master_df['Expenses in ' + year]).rename('Profit in ' + year)
    ).dropna()
    for year in ('2016', '2017')
})

# Copy simple numerical data
feature_data.update({
    name: pd.DataFrame(master_df[name]).dropna()
    for name in (
        'Number of Employees',
        'Revenue in 2016',
        'Expenses in 2016',
        'Investment in 2016',
        'Revenue in 2017',
        'Expenses in 2017',
        'Investment in 2017',
        'Crimes Within 500m',
        'Distance to Bus Stop',
    )
})

# Gather types of features
all_features = set(feature_data.keys())

all_categorical_features = {
    'Business Type',
    'NAICS Type',
}

all_numerical_features = all_features - all_categorical_features

NameError: name 'categorical_features' is not defined

# Inspect Formatted Data

In [None]:
# Display the derived 'Profit in 2017' frame (revenue minus expenses, rows with a missing value dropped)
feature_data['Profit in 2017']

# Do Some Clustering

In [None]:
# List the names of every prepared feature; these are the valid entries for the `features` selection below
feature_data.keys()

In [None]:
# Marker styling shared by the scatter plots in this notebook
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

# Features fed to the clustering; (un)comment entries to change the selection
features = {
#     'Latitude',
#     'Longitude',
#     'Business Type',
#     'NAICS Type',
    'Age',
#     'Number of Employees',
    'Successfulness',
#     'Profit in 2016',
    'Revenue in 2016',
    'Expenses in 2016',
#     'Investment in 2016',
#     'Profit in 2017',
    'Revenue in 2017',
    'Expenses in 2017',
#     'Investment in 2017',
#     'Crimes Within 500m',
#     'Distance to Bus Stop',
}

# Gather used features in a single dataset.
# Join in sorted order: iterating the set directly would make the column order
# (and therefore which two columns end up in the preview plot) depend on
# string hashing rather than on the selection itself.
data_df = pd.DataFrame(index=master_df.index)
for feature in sorted(features):
    data_df = data_df.join(feature_data[feature])

# Keep only the rows that have a value for every selected feature
data_df = data_df.dropna()

# Normalize data: whiten divides each numerical column by its standard
# deviation. Categorical indicator columns are left as 0/1.
norm_data_df = data_df.copy()
for feature in sorted(features & all_numerical_features):
    # Single-string print works identically on Python 2 and Python 3
    print("Normalizing " + feature)
    norm_data_df[feature] = whiten(norm_data_df[feature].astype('float'))

# .values replaces DataFrame.as_matrix(), which newer pandas no longer provides
data = np.array(norm_data_df.values)

print("%d entries" % len(data_df))

# Preview the first two columns; with a single column, plot it against itself
y_column = 0 if data.shape[1] == 1 else 1
plt.scatter(data.T[0], data.T[y_column], c='b', **plot_kwds)

# The axes carry normalized units, so hide them
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)

In [None]:
# Shamelessly borrowed from online
# def plot_clusters(data, algorithm, args, kwds):
#     labels = algorithm(*args,**kwds).fit_predict(data)
#     palette = sns.color_palette('deep', np.unique(labels).max() + 1)
#     colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
#     plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
#     frame = plt.gca()
#     frame.axes.get_xaxis().set_visible(False)
#     frame.axes.get_yaxis().set_visible(False)
#     plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    
#     return labels

In [None]:
# Input: `data` is the normalized feature matrix built in the previous cell
algorithm = cluster.KMeans
args = ()
# random_state is pinned so the cluster assignment is the same on every run
kwds = {'n_clusters':6, 'random_state': 0}

# Fit the model and get one integer cluster label per row of `data`
labels = algorithm(*args, **kwds).fit_predict(data)

# One palette colour per cluster id. Labels below zero are drawn in black
# (presumably the "noise" label of other algorithms - KMeans is not expected to emit them).
palette = sns.color_palette('deep', np.unique(labels).max() + 1)
colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

# Same projection as the preview plot: first two columns, or the only column
# against itself when the matrix has a single column
y_column = 0 if data.shape[1] == 1 else 1
plt.scatter(data.T[0], data.T[y_column], c=colors, **plot_kwds)

frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)

# Format Results

In [None]:
def makeCompactSeries(sparse_df, column, df=None):
    """Collapse 0/1 indicator columns back into one categorical column.

    For every distinct non-missing value of ``df[column]``, the rows whose
    indicator column ``"<column> - <value>"`` in ``sparse_df`` equals 1
    receive that value.

    Parameters
    ----------
    sparse_df : pandas.DataFrame
        Frame containing the indicator columns named ``"<column> - <value>"``.
    column : str
        Name of the categorical column to rebuild.
    df : pandas.DataFrame, optional
        Frame that supplies the set of labels and the index of the result.
        Defaults to the notebook-level ``master_df``, looked up when the
        function is called (not when it is defined).

    Returns
    -------
    pandas.DataFrame
        A single column named ``column``, indexed like ``df``. Rows with no
        indicator set (or absent from ``sparse_df``) stay NaN.

    Raises
    ------
    KeyError
        If ``sparse_df`` lacks the indicator column for one of the labels.
    """
    if df is None:
        df = master_df

    compact_df = pd.DataFrame(index=df.index, columns=[column])

    # Missing values have no indicator column, so they are skipped
    for value in df[column].dropna().unique():
        indicator = sparse_df[column + " - " + value]
        hit_index = indicator.index[indicator == 1]
        # Single .loc write: chained indexing (frame[col][rows] = ...) may
        # assign into a temporary copy and leave compact_df untouched
        compact_df.loc[hit_index, column] = value

    return compact_df

In [None]:
# Start from an empty frame that shares the index of the clustered rows
results_df = pd.DataFrame(index=data_df.index)

# Categorical features in use: fold their indicator columns back into one labelled column each
categorical_in_use = (all_categorical_features & features)
for feature in categorical_in_use:
    results_df[feature] = makeCompactSeries(data_df, feature)

# All remaining selected features are already single columns and are carried over as-is
plain_in_use = ((all_features & features) - all_categorical_features)
for feature in plain_in_use:
    results_df[feature] = data_df[feature]

# The cluster assignment goes in the first column
results_df.insert(0, 'Cluster', labels)

results_df.sort_values(by='Cluster')

In [None]:
# Number of rows assigned to each cluster label
results_df['Cluster'].value_counts()

In [None]:
# Show the result table ordered by cluster label (displays a sorted copy; results_df itself is not reordered)
results_df.sort_values('Cluster')