In [1]:
import pandas as pd
import numpy as np
# Import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import sys
sys.path.insert(0,'../')
%load_ext autoreload
%autoreload 2

from math import log

import datetime 
from datetime import datetime as dt
from geopy.distance import vincenty # requires separate install - pip install geopy
import warnings
warnings.filterwarnings('ignore')

#Imports for Classification
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB #Naive Bayes Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#Split Training and Testing Set.
from sklearn.model_selection import train_test_split

#Generate Classification Performance Results
from sklearn.metrics import classification_report

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

#Import imputer
from sklearn.preprocessing import Imputer

# statistic analysis
from sklearn.feature_selection import f_classif
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [2]:
# Load in the data with `read_csv()`
Charlotte_IGNORE = pd.read_csv("Datasets/Charlotte-Mecklenburg_2011-2016_IGNORE_KNN.csv", header = 0)
Charlotte_FIXED = pd.read_csv("Datasets/Charlotte-Mecklenburg_2011-2016_FIXED_KNN.csv", header = 0)
Charlotte_MEAN = pd.read_csv("Datasets/Charlotte-Mecklenburg_2011-2016_MEAN_KNN.csv", header = 0)
Charlotte_MEDIAN = pd.read_csv("Datasets/Charlotte-Mecklenburg_2011-2016_MEDIAN_KNN.csv", header = 0)
Charlotte_MOST_FREQUENT = pd.read_csv("Datasets/Charlotte-Mecklenburg_2011-2016_MOST_FREQUENT_KNN.csv", header = 0)

Charlotte_IGNORE.fillna(-1, inplace = True)
Charlotte_FIXED.fillna(-1, inplace = True)
Charlotte_MEAN.fillna(-1, inplace = True)
Charlotte_MEDIAN.fillna(-1, inplace = True)
Charlotte_MOST_FREQUENT.fillna(-1, inplace = True)

In [3]:
def createCRClass(df, nClass):
    df['Violent_Crime_Rate_Log2'] = 0.0
    df['Property_Crime_Rate_Log2'] = 0.0
    df['Combined_Crime_Rate_Log2'] = 0.0

    for index, row in df.iterrows():
        if(row['Violent_Crime_Rate'] != -1):
            if(row['Violent_Crime_Rate'] < 1):
                df.at[index, 'Violent_Crime_Rate_Log2'] = df.at[index, 'Violent_Crime_Rate']
            else:
                df.at[index, 'Violent_Crime_Rate_Log2'] = log(df.at[index, 'Violent_Crime_Rate'], 2.0)
        else:
            df.at[index, 'Violent_Crime_Rate_Log2'] = -1

        if(row['Property_Crime_Rate'] != -1):
            if(row['Property_Crime_Rate'] < 1):
                df.at[index, 'Property_Crime_Rate_Log2'] = df.at[index, 'Property_Crime_Rate']
            else:
                df.at[index, 'Property_Crime_Rate_Log2'] = log(df.at[index, 'Property_Crime_Rate'], 2.0)
        else:
            df.at[index, 'Property_Crime_Rate_Log2'] = -1

        if(row['Combined_Crime_Rate'] != -1):
            if(row['Combined_Crime_Rate'] < 1):
                df.at[index, 'Combined_Crime_Rate_Log2'] = df.at[index, 'Combined_Crime_Rate']
            else:
                df.at[index, 'Combined_Crime_Rate_Log2'] = log(df.at[index, 'Combined_Crime_Rate'], 2.0)
        else:
            df.at[index, 'Combined_Crime_Rate_Log2'] = -1

    # interval values to retrieve crime rate from crime rate classes
    interval_vcr = max(df['Violent_Crime_Rate_Log2']) / nClass + 0.000001
    interval_pcr = max(df['Property_Crime_Rate_Log2']) / nClass + 0.000001
    interval_cmb = max(df['Combined_Crime_Rate_Log2']) / nClass + 0.000001

    df['Violent_Crime_Rate_Class'] = 0
    df['Property_Crime_Rate_Class'] = 0
    df['Combined_Crime_Rate_Class'] = 0


    for index, row in df.iterrows():
        if(row['Violent_Crime_Rate'] != -1):
            df.at[index, 'Violent_Crime_Rate_Class'] = int(df.at[index, 'Violent_Crime_Rate_Log2'] / interval_vcr)
        else:
            df.at[index, 'Violent_Crime_Rate_Class'] = -1

        if(row['Property_Crime_Rate'] != -1):
            df.at[index, 'Property_Crime_Rate_Class'] = int(df.at[index, 'Property_Crime_Rate_Log2'] / interval_pcr)
        else:
            df.at[index, 'Property_Crime_Rate_Class'] = -1
            
        if(row['Combined_Crime_Rate'] != -1):
            df.at[index, 'Combined_Crime_Rate_Class'] = int(df.at[index, 'Combined_Crime_Rate'] / interval_pcr)
        else:
            df.at[index, 'Combined_Crime_Rate_Class'] = -1

In [4]:
# define number of classes
NUM_OF_CLASS = 5
print(Charlotte_FIXED.shape)

createCRClass(Charlotte_IGNORE, NUM_OF_CLASS)
createCRClass(Charlotte_FIXED, NUM_OF_CLASS)
createCRClass(Charlotte_MEAN, NUM_OF_CLASS)
createCRClass(Charlotte_MEDIAN, NUM_OF_CLASS)
createCRClass(Charlotte_MOST_FREQUENT, NUM_OF_CLASS)
print(Charlotte_FIXED.shape)

(1848, 34)
(1848, 40)


In [5]:
InfluentialFeatures = pd.DataFrame(columns = ['IGNORE_VCR', 'IGNORE_PCR', 'IGNORE_CMD', 'FIXED_VCR', 'FIXED_PCR', 'FIXED_CMD',
                                              'MEAN_VCR', 'MEAN_PCR', 'MEAN_CMD', 'MEDIAN_VCR', 'MEDIAN_PCR', 'MEDIAN_CMD', 
                                              'MOST_FREQUENT_VCR', 'MOST_FREQUENT_PCR', 'MOST_FREQUENT_CMD'])

In [6]:
X = Charlotte_FIXED.loc[:, ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']]
Y = Charlotte_FIXED.loc[:,['Violent_Crime_Rate_Class']]

est = sm.OLS(Y, X)
est2 = est.fit()
features = pd.DataFrame()
features['p_values'] = est2.pvalues
coefficients = est2.params
coefficients = np.round(coefficients,4)
features['coefficients'] = coefficients
features['features'] = ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']
features.index = range(31)
features = features.sort_values('p_values')
print(features)

        p_values  coefficients                                 features
29  2.682445e-15        0.0077                        Transit_Proximity
26  7.785141e-14        0.1587                       Housing_Violations
18  1.193046e-12       -0.0002                             Housing_Size
28  3.207969e-10        0.2706                      Street_Connectivity
7   1.056095e-06        0.0075                  Commercial_Building_Age
22  2.198735e-06       -0.0432                   Residential_Renovation
6   2.775231e-05        0.0000                    Commercial_Size_Total
30  7.383878e-04       -0.0004                  Transit_Proximate_Units
5   2.093399e-03        0.0000                          Commercial_Size
19  2.098300e-03        0.0045                              Housing_Age
8   3.250033e-03       -0.0011                       Impervious_Surface
24  6.406294e-03        0.0686                             Foreclosures
1   7.370884e-03        0.0060                              Vaca

In [7]:
X = Charlotte_FIXED.loc[:, ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']]
Y = Charlotte_FIXED.loc[:,['Property_Crime_Rate_Class']]

est = sm.OLS(Y, X)
est2 = est.fit()
features = pd.DataFrame()
features['p_values'] = est2.pvalues
coefficients = est2.params
coefficients = np.round(coefficients,4)
features['coefficients'] = coefficients
features['features'] = ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']
features.index = range(31)
features = features.sort_values('p_values')
print(features)

        p_values  coefficients                                 features
8   7.369648e-26       -0.0036                       Impervious_Surface
29  1.386598e-11        0.0060                        Transit_Proximity
28  8.646858e-07        0.1955                      Street_Connectivity
12  1.235355e-06        0.0051                       Pharmacy_Proximity
7   7.325758e-06        0.0064                  Commercial_Building_Age
24  1.389991e-05        0.1013                             Foreclosures
19  3.342581e-05        0.0057                              Housing_Age
1   8.802664e-05        0.0082                              Vacant_Land
15  5.220793e-04        0.0005                            Housing_Units
5   9.863707e-04        0.0000                          Commercial_Size
22  2.136762e-03       -0.0259                   Residential_Renovation
26  2.502397e-03        0.0590                       Housing_Violations
9   3.259464e-03        0.0008                  Impervious_Surfa

In [8]:
X = Charlotte_FIXED.loc[:, ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']]
Y = Charlotte_FIXED.loc[:,['Combined_Crime_Rate_Class']]

est = sm.OLS(Y, X)
est2 = est.fit()
features = pd.DataFrame()
features['p_values'] = est2.pvalues
coefficients = est2.params
coefficients = np.round(coefficients,4)
features['coefficients'] = coefficients
features['features'] = ['NPA', 'Vacant_Land', 
    'Vacant_Land_Area', 'Commercial_Construction', 'Commercial_Construction_Permitted_Units', 'Commercial_Size', 
    'Commercial_Size_Total', 'Commercial_Building_Age', 'Impervious_Surface', 'Impervious_Surface_Area', 'Adopt_a_Stream', 
    'Adopt_a_Stream_Length', 'Pharmacy_Proximity', 'Pharmacy_Proximate_Units', 'Housing_Density', 'Housing_Units', 
    'Single_Family_Housing', 'Single_Family_Units', 'Housing_Size', 'Housing_Age', 'New_Residential', 'New_Residential_Permit_Units', 
    'Residential_Renovation', 'Residential_Renovation_Permit_Units', 'Foreclosures', 'Foreclosed_Units', 'Housing_Violations', 
    'Housing_Violations_Total', 'Street_Connectivity', 'Transit_Proximity', 'Transit_Proximate_Units']
features.index = range(31)
features = features.sort_values('p_values')
print(features)

        p_values  coefficients                                 features
6   1.488596e-13        0.0000                    Commercial_Size_Total
8   1.854704e-13       -0.0554                       Impervious_Surface
29  7.148306e-09        0.1152                        Transit_Proximity
1   5.389120e-08        0.2531                              Vacant_Land
30  1.321217e-04       -0.0082                  Transit_Proximate_Units
19  1.697635e-04        0.1146                              Housing_Age
2   4.012356e-04       -0.0077                         Vacant_Land_Area
16  1.868485e-03       -0.0761                    Single_Family_Housing
14  1.952099e-03       -1.2987                          Housing_Density
22  6.346192e-03       -0.5131                   Residential_Renovation
28  6.652360e-03        2.4000                      Street_Connectivity
18  3.486461e-02       -0.0013                             Housing_Size
12  5.659290e-02        0.0450                       Pharmacy_Pr