In [367]:
import pandas as pd # panda's nickname is pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score



In [368]:
## Load the communities-crime-clean.csv file into a dataframe object

In [369]:
# Read 'communities-crime-clean.csv' into a DataFrame. The path is relative, so the
# file must be in the directory the notebook kernel is running from.
communities_crime_df = pd.read_csv('communities-crime-clean.csv')

In [370]:
# Sanity check: display the first rows to confirm the file loaded as expected.
communities_crime_df.head()

Unnamed: 0,state,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,1,Alabastercity,7,0.01,0.61,0.21,0.83,0.02,0.01,0.41,...,0.03,0.7,0.4,0.34,0.57,0.05,0.06,0.01,0.0,0.06
1,1,AlexanderCitycity,10,0.01,0.41,0.55,0.57,0.01,0.0,0.47,...,0.0,0.93,0.66,0.82,0.84,0.11,0.03,0.01,0.0,0.14
2,1,Annistoncity,3,0.03,0.34,0.86,0.3,0.04,0.01,0.41,...,0.04,0.77,0.59,0.7,0.64,0.06,0.11,0.04,0.0,1.0
3,1,Athenscity,8,0.01,0.38,0.35,0.71,0.04,0.01,0.39,...,0.03,0.78,0.56,0.67,0.71,0.09,0.05,0.0,0.0,0.23
4,1,Auburncity,1,0.04,0.37,0.32,0.7,0.21,0.02,1.0,...,0.12,0.49,0.12,0.0,0.15,0.09,0.09,0.01,0.0,0.15


In [371]:
## Create a new field “highCrime” which is true if the crime rate per capita (ViolentCrimesPerPop) is greater than 0.1, and false otherwise
def setHighCrime(df):
    """Return True when a record's violent-crime rate is above 0.1, else False.

    `df` is a single record (for example one DataFrame row passed by
    `DataFrame.apply(..., axis=1)`) that can be indexed with the key
    'ViolentCrimesPerPop'. A rate of exactly 0.1 is not considered high.
    """
    # bool() turns the comparison result into a plain Python True/False.
    return bool(df['ViolentCrimesPerPop'] > 0.1)
  
# Label each community: highCrime is True when ViolentCrimesPerPop is greater than 0.1.
# The comparison is applied to the whole column at once, which yields the same boolean
# column as calling a per-row function through apply(axis=1) but without the Python-level
# loop over rows.
communities_crime_df['highCrime'] = communities_crime_df['ViolentCrimesPerPop'] > 0.1
communities_crime_df.head()

Unnamed: 0,state,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,...,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop,highCrime
0,1,Alabastercity,7,0.01,0.61,0.21,0.83,0.02,0.01,0.41,...,0.7,0.4,0.34,0.57,0.05,0.06,0.01,0.0,0.06,False
1,1,AlexanderCitycity,10,0.01,0.41,0.55,0.57,0.01,0.0,0.47,...,0.93,0.66,0.82,0.84,0.11,0.03,0.01,0.0,0.14,True
2,1,Annistoncity,3,0.03,0.34,0.86,0.3,0.04,0.01,0.41,...,0.77,0.59,0.7,0.64,0.06,0.11,0.04,0.0,1.0,True
3,1,Athenscity,8,0.01,0.38,0.35,0.71,0.04,0.01,0.39,...,0.78,0.56,0.67,0.71,0.09,0.05,0.0,0.0,0.23,True
4,1,Auburncity,1,0.04,0.37,0.32,0.7,0.21,0.02,1.0,...,0.49,0.12,0.0,0.15,0.09,0.09,0.01,0.0,0.15,True


In [372]:
# Class balance of the highCrime label: share of True and of False rows, in percent.
high_crime_flags = communities_crime_df["highCrime"]

total_instance = high_crime_flags.count()
positive_instance = high_crime_flags[high_crime_flags == True].count()
negative_instance = high_crime_flags[high_crime_flags == False].count()

percentage_positive_instance = positive_instance / total_instance * 100
percentage_negative_instance = negative_instance / total_instance * 100

print("Percentage positive instance = ", percentage_positive_instance)
print("Percentage negative instance = ", percentage_negative_instance)

Percentage positive instance =  62.7195183141
Percentage negative instance =  37.2804816859


In [373]:
def setTarget(df):
    """Encode a record's 'highCrime' flag as an integer: 1 when it equals True, else 0.

    `df` is a single record (for example one DataFrame row passed by
    `DataFrame.apply(..., axis=1)`) that can be indexed with the key 'highCrime'.
    """
    return 1 if df['highCrime'] == True else 0
  

# Drop the raw crime rate that highCrime was derived from, so it cannot end up among
# the features. errors='ignore' keeps this cell re-runnable: on a second run the column
# is already gone and an unguarded drop would raise KeyError.
communities_crime_df = communities_crime_df.drop('ViolentCrimesPerPop', axis=1, errors='ignore')

# Feature columns are selected by position.
# NOTE(review): confirm that columns[2:103] covers exactly the intended predictors; the
# head() output suggests the slice may start at the 'fold' column, which looks like a
# bookkeeping field and not a predictor.
features = list(communities_crime_df.columns[2:103])

# Integer-encode the boolean label (True -> 1, False -> 0) for the whole column at once.
communities_crime_df['highCrime_Boolean'] = communities_crime_df['highCrime'].astype(int)

y = communities_crime_df["highCrime_Boolean"]
X = communities_crime_df[features]

# A fixed random_state makes the fitted tree reproducible between runs.
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)

# Predictions are made on the same rows the tree was fitted on, so any score computed
# from them is a training score, not an estimate of generalisation.
p = dt.predict(X)
communities_crime_df['Predicted_HighCrime'] = pd.Series(p, index=communities_crime_df.index)

communities_crime_df.head()

Unnamed: 0,state,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,...,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,highCrime,highCrime_Boolean,Predicted_HighCrime
0,1,Alabastercity,7,0.01,0.61,0.21,0.83,0.02,0.01,0.41,...,0.4,0.34,0.57,0.05,0.06,0.01,0.0,False,0,0
1,1,AlexanderCitycity,10,0.01,0.41,0.55,0.57,0.01,0.0,0.47,...,0.66,0.82,0.84,0.11,0.03,0.01,0.0,True,1,1
2,1,Annistoncity,3,0.03,0.34,0.86,0.3,0.04,0.01,0.41,...,0.59,0.7,0.64,0.06,0.11,0.04,0.0,True,1,1
3,1,Athenscity,8,0.01,0.38,0.35,0.71,0.04,0.01,0.39,...,0.56,0.67,0.71,0.09,0.05,0.0,0.0,True,1,1
4,1,Auburncity,1,0.04,0.37,0.32,0.7,0.21,0.02,1.0,...,0.12,0.0,0.15,0.09,0.09,0.01,0.0,True,1,1


In [374]:
## Confusion matrix of actual labels (rows) against predicted labels (columns).
## For binary labels the entries are: C[0,0] = true negatives, C[0,1] = false positives,
## C[1,0] = false negatives, C[1,1] = true positives.
confusion_matrix(communities_crime_df['highCrime_Boolean'], communities_crime_df['Predicted_HighCrime'])

array([[ 663,   80],
       [  45, 1205]])

In [375]:
# Score the predictions against the actual labels.
# The results are stored under their own names and not under `recall_score`,
# `precision_score` or `accuracy_score`: rebinding those names would replace the
# imported scikit-learn functions with floats, so re-running this cell (or calling the
# functions anywhere later) would fail with "TypeError: ... object is not callable".
actual_labels = communities_crime_df['highCrime_Boolean']
predicted_labels = communities_crime_df['Predicted_HighCrime']

train_recall = recall_score(actual_labels, predicted_labels)
train_precision = precision_score(actual_labels, predicted_labels)
train_accuracy = accuracy_score(actual_labels, predicted_labels)

print("Training Accuracy = {} Precision = {} Recall = {}".format(train_accuracy, train_precision, train_recall))

Training Accuracy = 0.9372804816859006 Precision = 0.9377431906614786 Recall = 0.964
