In [3]:
% matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw death records and remember the column index of the file as loaded.
death_records = pd.read_csv('DeathRecords.csv')
death_records_columns = death_records.columns

# Keep only rows with EducationReportingFlag == 1, then discard the flag and
# the Education1989Revision column.
# NOTE(review): presumably flag 1 marks rows coded with the 2003 education
# revision (the 1989 column is dropped here) -- confirm in the data dictionary.
has_education_flag_1 = death_records['EducationReportingFlag'] == 1
death_records = death_records.loc[has_education_flag_1].drop(
    ['EducationReportingFlag', 'Education1989Revision'], axis=1)

In [62]:
# Split out the two causes of interest by their CauseRecode113 code
# (128 is treated as gun homicide, 125 as gun suicide, per the variable names).
gun_homicides = death_records.loc[death_records['CauseRecode113'] == 128]
gun_suicides = death_records.loc[death_records['CauseRecode113'] == 125]

# Columns that are not used by the model.  The raw 'Age' column is dropped so
# that the 12-level recode can take over its name in the rename below.
unused_columns = [
    'CauseRecode39', 'CauseRecode358',
    'InfantCauseRecode130', 'PlaceOfDeathAndDecedentsStatus',
    'MethodOfDisposition', 'Autopsy',
    'NumberOfEntityAxisConditions',
    'NumberOfRecordAxisConditions', 'AgeRecode27',
    'AgeRecode52', 'AgeSubstitutionFlag', 'AgeType',
    'InfantAgeRecode22', 'CurrentDataYear',
    'MannerOfDeath', 'Icd10Code', 'Age',
]

# Shorter working names; 'Homicide_1' is the binary target (1 = homicide).
short_names = {
    'CauseRecode113': 'Homicide_1',
    'Education2003Revision': 'Edu',
    'MonthOfDeath': 'DeathMonth',
    'DayOfWeekOfDeath': 'DeathDay',
    'AgeRecode12': 'Age',
}

# Homicides are stacked first, then suicides; the cause code is recoded to a
# 0/1 target before the unused columns are removed and the rest renamed.
gun_deaths = (
    pd.concat([gun_homicides, gun_suicides])
    .assign(CauseRecode113=lambda df: df['CauseRecode113'].map({128: 1, 125: 0}))
    .drop(unused_columns, axis=1)
    .rename(columns=short_names)
)

# Row filter: keep a record only if every condition below holds.
# NOTE(review): the numeric cut-offs (Edu 9, Age 12, HispanicOrigin 996+,
# HispanicOriginRaceRecode 9) and the 'U' values are presumably the
# "unknown / not stated" codes -- confirm against the data dictionary.
keep_row = (
    (gun_deaths['RaceImputationFlag'] == 0)
    & (gun_deaths['InjuryAtWork'] != 'U')
    & (gun_deaths['BridgedRaceFlag'] == 0)
    & (gun_deaths['Edu'] < 9)
    & (gun_deaths['MaritalStatus'] != 'U')
    & (gun_deaths['Age'] < 12)
    & (gun_deaths['HispanicOrigin'] < 996)
    & (gun_deaths['HispanicOriginRaceRecode'] < 9)
)

# The two flag columns are constant (0) after filtering, so they are dropped.
# The explicit .copy() makes the result an independent frame: assigning
# columns on a boolean-filtered slice (as the previous version did) triggers
# pandas' SettingWithCopyWarning.
gun_deaths = (
    gun_deaths[keep_row]
    .drop(['RaceImputationFlag', 'BridgedRaceFlag'], axis=1)
    .copy()
)

# Recode the remaining two-valued text columns as 0/1 indicators.
gun_deaths['InjuryAtWork'] = gun_deaths['InjuryAtWork'].map({'Y': 1, 'N': 0})
gun_deaths['Sex'] = gun_deaths['Sex'].map({'M': 0, 'F': 1})

# Constant column of ones, appended last.
gun_deaths['Intercepts'] = np.ones(len(gun_deaths))

def _one_hot(column):
    """Return 0/1 indicator columns for ``gun_deaths[column]``.

    The indicator columns are named ``<column>_<value>``, one per distinct
    value found in the column.
    """
    return pd.get_dummies(gun_deaths[column], prefix=column)


# One indicator frame per categorical variable.  The trailing comments record
# the intended baseline (reference) level of each variable.
resident_dummies = _one_hot('ResidentStatus')              # baseline: 1 - resident of state/county where death occurred
edu_dummies = _one_hot('Edu')                              # baseline: 1 - 8th grade or less
age_dummies = _one_hot('Age')                              # baseline: 1 - less than a year old
marital_dummies = _one_hot('MaritalStatus')                # baseline: D - divorced
injury_dummies = _one_hot('PlaceOfInjury')                 # baseline: 0 - occurred at home
activity_dummies = _one_hot('ActivityCode')                # baseline: 0 - occurred during sports
race_dummies = _one_hot('Race')                            # baseline: 1 - white
race3_dummies = _one_hot('RaceRecode3')                    # baseline: 1 - white
race5_dummies = _one_hot('RaceRecode5')                    # baseline: 1 - white
hisp_dummies = _one_hot('HispanicOrigin')                  # baseline: 100 - non-Hispanic
hisprecode_dummies = _one_hot('HispanicOriginRaceRecode')  # baseline: 1 - Mexican


# Non-categorical columns carried over unchanged into the modelling frame.
cols_to_keep = ['Id', 'Sex', 'Intercepts', 'DeathMonth', 'DeathDay', 'Homicide_1']

# For each indicator frame, the first column to KEEP.  Every column that sorts
# before it is left out and therefore folded into the baseline level.
# NOTE(review): several slices start at "_2", so a "_1" column (if the data
# contains that level) is excluded together with the "_0" column.
dummy_blocks = [
    (resident_dummies, 'ResidentStatus_2'),
    (edu_dummies, 'Edu_2'),
    (age_dummies, 'Age_2'),
    (marital_dummies, 'MaritalStatus_M'),
    (injury_dummies, 'PlaceOfInjury_2'),
    (activity_dummies, 'ActivityCode_2'),
    (race_dummies, 'Race_2'),
    (race3_dummies, 'RaceRecode3_2'),
    (race5_dummies, 'RaceRecode5_2'),
    (hisp_dummies, 'HispanicOrigin_200'),
    (hisprecode_dummies, 'HispanicOriginRaceRecode_2'),
]

# Label-based column slices use .loc; the .ix indexer used previously was
# deprecated and then removed from pandas.
dummy_data = gun_deaths[cols_to_keep]
for dummies, first_kept in dummy_blocks:
    dummy_data = dummy_data.join(dummies.loc[:, first_kept:])

# Row counts: suicides and homicides before filtering, modelling rows after.
print(len(gun_suicides),len(gun_homicides), len(dummy_data))

19229 10020 27052


In [63]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split as tts

# Indicator columns (plus the constant column) used as predictors.
# Note that 'Sex' is not part of this list; it is added when X is built below.
features_list = ['Intercepts', 'ResidentStatus_2','ResidentStatus_3', 'ResidentStatus_4', 
'Edu_2', 'Edu_3', 'Edu_4', 'Edu_5', 'Edu_6', 'Edu_7', 'Edu_8',
'Age_2', 'Age_3', 'Age_4', 'Age_5', 'Age_6', 'Age_7', 'Age_8','Age_9', 'Age_10', 'Age_11',
'MaritalStatus_M', 'MaritalStatus_S', 'MaritalStatus_W',
'PlaceOfInjury_2', 'PlaceOfInjury_3', 'PlaceOfInjury_4', 'PlaceOfInjury_5',
'PlaceOfInjury_6', 'PlaceOfInjury_7', 'PlaceOfInjury_8', 'PlaceOfInjury_9',
'ActivityCode_2', 'ActivityCode_4', 'ActivityCode_8', 'ActivityCode_9', 
'HispanicOriginRaceRecode_2', 'HispanicOriginRaceRecode_3',
'HispanicOriginRaceRecode_4', 'HispanicOriginRaceRecode_5',
'HispanicOriginRaceRecode_6', 'HispanicOriginRaceRecode_7',
'HispanicOriginRaceRecode_8']

# Design matrix: 'Intercepts', then 'Sex', then the remaining features in
# features_list order.  Derived from features_list instead of repeating the
# whole literal, so the two can no longer drift apart.
X = dummy_data[features_list[:1] + ['Sex'] + features_list[1:]]



# Human-readable labels for the model coefficients.  They are matched BY
# POSITION with the columns of X when the coefficients are printed, so the
# order and the number of entries must follow X exactly.  One sub-list per
# source variable:
x_vars = (
    # constant column and sex indicator
    ['Intercepts', 'Sex']
    # ResidentStatus_2 .. ResidentStatus_4
    + ['SameState', 'SameCountry', 'ForeignBorn']
    # Edu_2 .. Edu_8
    + ['9_12grade', 'HS_grad', 'CollegeCredit', 'Assoc_Deg',
       'Bach_Deg', 'Master_Deg', 'Doctorate']
    # Age_2 .. Age_11
    + ['1_4_yr', '5_14_yr', '15_24_yr', '25_34_yr', '35_44_yr',
       '45_54_yr', '55_64_yr', '65_74_yr', '75_84_yr', '85_plus_yr']
    # MaritalStatus_M, _S, _W
    + ['Married', 'Single', 'Widowed']
    # PlaceOfInjury_2 .. PlaceOfInjury_9
    + ['school', 'sports_area', 'road', 'trade_area',
       'industrial_area', 'farm', 'other', 'unknown']
    # ActivityCode_2, _4, _8, _9 (labels 'other'/'unknown' repeat the
    # PlaceOfInjury labels above)
    + ['working', 'resting', 'other', 'unknown']
    # HispanicOriginRaceRecode_2 .. HispanicOriginRaceRecode_8
    + ['puerto_rican', 'cuban', 'central_south_american',
       'other_unknown_hisp', 'white', 'black', 'other']
)

# Binary target: 1 = homicide, 0 = suicide.
y = dummy_data['Homicide_1']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = tts(X,y, test_size=0.30, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# NOTE(review): X already contains the constant 'Intercepts' column while
# LogisticRegression() is created with its default settings -- if the
# estimator also fits its own intercept, the coefficient reported for
# 'Intercepts' is not the full model intercept.  Confirm before interpreting.
logit_model = LogisticRegression()
logit_train_results = logit_model.fit(x_train, y_train)   # the fitted estimator
logit_test_results = logit_model.predict(x_test)          # predicted class labels
logit_test_prob = logit_model.predict_proba(x_test)       # per-class probabilities

# Hold-out accuracy and ROC AUC (AUC uses the second probability column).
# The earlier bare logit_model.score(...) call was removed: its result was
# discarded and it recomputed the accuracy printed here.
print(metrics.accuracy_score(y_test, logit_test_results))
print(metrics.roc_auc_score(y_test, logit_test_prob[:, 1]))

print(metrics.confusion_matrix(y_test, logit_test_results))
print(metrics.classification_report(y_test, logit_test_results))

# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh model on the full data set,
# as a check on the single hold-out estimate above.
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

# Coefficients of the fitted model, one per column of the training matrix.
coefs = list(logit_train_results.coef_[0])

# Labels and coefficients are paired purely by position, so a length mismatch
# would silently mislabel or drop coefficients -- fail loudly instead.
if len(x_vars) != len(coefs):
    raise ValueError(
        "x_vars has %d labels but the model has %d coefficients"
        % (len(x_vars), len(coefs)))

# Print and collect one "label: coefficient" line per predictor.
var_coefs = []
for label, coef in zip(x_vars, coefs):
    vc = str(label) + ": " + str(coef)
    print(vc)
    var_coefs.append(vc)


0.857565303105
0.903800278006
[[4923  430]
 [ 726 2037]]
             precision    recall  f1-score   support

          0       0.87      0.92      0.89      5353
          1       0.83      0.74      0.78      2763

avg / total       0.86      0.86      0.86      8116

[ 0.84331116  0.86289727  0.85809313  0.88913525  0.87620103  0.85175601
  0.85249538  0.86871302  0.81360947  0.85465976]
0.857087147668
Intercepts: -0.0936913449694
Sex: 0.917405507831
SameState: 0.156666735575
SameCountry: 0.198925445639
ForeignBorn: 1.22444707174
9_12grade: 0.0490647998844
HS_grad: -0.462435221749
CollegeCredit: -0.83526615299
Assoc_Deg: -0.798807361804
Bach_Deg: -1.26339755484
Master_Deg: -1.23036426837
Doctorate: -1.3609967354
1_4_yr: 2.43174725116
5_14_yr: 0.0408614977208
15_24_yr: 0.144353582984
25_34_yr: 0.321508880799
35_44_yr: 0.141870051064
45_54_yr: -0.234511887746
55_64_yr: -0.607945744032
65_74_yr: -0.914787564819
75_84_yr: -1.22375671313
85_plus_yr: -1.5679523081
Married: -0.03862800981

In [58]:
# Predictors plus target, without the constant column.
full = X.join(y).drop('Intercepts', axis=1)

# Long format: one row per (variable, value) observation.
comparative_data = pd.melt(full)

# Count how often each value occurs for every variable, then transpose so
# that variables are the rows and the observed values are the columns.
freq_data = pd.crosstab(
    index=[comparative_data['value']],
    columns=[comparative_data['variable']],
).T

# Persist the frequency table for use outside the notebook.
freq_data.to_csv('deaths_comparison.csv')

In [59]:
freq_data

value,0.0,1.0
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ActivityCode_2,27051,1
ActivityCode_4,27034,18
ActivityCode_8,27018,34
ActivityCode_9,59,26993
Age_10,25477,1575
Age_11,26268,784
Age_2,27014,38
Age_3,26766,286
Age_4,22088,4964
Age_5,21954,5098
