In [28]:
# Pandas
import pandas as pd

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2

# Encoders and model evaluation
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict

In [3]:
crimes = pd.read_csv("Chicago_Crimes_2008-2011.csv")

In [4]:
print(len(crimes))

638191


In [5]:
# Handle Missing Data
crimes = crimes.dropna()
print(len(crimes))

637795


In [6]:
crimes ['Domestic'].value_counts()

False    501277
True     136518
Name: Domestic, dtype: int64

In [8]:
crimes['Location Description'].value_counts()


6      120802
08B    119918
14      71244
26      69546
18      63940
5       38964
08A     29808
7       25671
3       22661
04B     17370
11      16705
04A      9868
15       5975
16       5307
24       4934
10       3044
20       3018
2        2595
17       2252
19       1507
22       1177
9         871
01A       336
13        177
12        105
Name: FBI Code, dtype: int64

In [9]:
crimes['Primary Type'].value_counts()

BATTERY                             137288
THEFT                               120802
CRIMINAL DAMAGE                      71244
NARCOTICS                            68174
OTHER OFFENSE                        45589
ASSAULT                              39342
BURGLARY                             38964
MOTOR VEHICLE THEFT                  25671
ROBBERY                              22661
DECEPTIVE PRACTICE                   20031
CRIMINAL TRESPASS                    15925
WEAPONS VIOLATION                     5975
OFFENSE INVOLVING CHILDREN            5396
PROSTITUTION                          5305
PUBLIC PEACE VIOLATION                5111
CRIM SEXUAL ASSAULT                   2232
SEX OFFENSE                           1899
GAMBLING                              1496
LIQUOR LAW VIOLATION                  1177
INTERFERENCE WITH PUBLIC OFFICER       970
ARSON                                  883
KIDNAPPING                             565
INTIMIDATION                           363
HOMICIDE   

In [10]:
crimes['FBI Code'].value_counts()

6      120802
08B    119918
14      71244
26      69546
18      63940
5       38964
08A     29808
7       25671
3       22661
04B     17370
11      16705
04A      9868
15       5975
16       5307
24       4934
10       3044
20       3018
2        2595
17       2252
19       1507
22       1177
9         871
01A       336
13        177
12        105
Name: FBI Code, dtype: int64

In [13]:
ld_vcts = crimes['Location Description'].value_counts()
to_remove = ld_vcts[ld_vcts < 5].index
crimes = crimes[~crimes['Location Description'].isin(to_remove)]
print(len(crimes))

637762


In [14]:
ld_vcts = crimes['Primary Type'].value_counts()
to_remove = ld_vcts[ld_vcts < 5].index
crimes = crimes[~crimes['Primary Type'].isin(to_remove)]
print(len(crimes))

637761


In [16]:
ld_vcts = crimes['FBI Code'].value_counts()
to_remove = ld_vcts[ld_vcts < 5].index
crimes = crimes[~crimes['FBI Code'].isin(to_remove)]
print(len(crimes))

637761


In [17]:
X = crimes.drop(['Domestic'], axis =1)
y = crimes['Domestic']
print(X.head())
print(y)

  Primary Type          Description Location Description  Arrest  Beat  \
0     HOMICIDE  FIRST DEGREE MURDER                ALLEY    True   323   
1     HOMICIDE  FIRST DEGREE MURDER               STREET    True  1533   
2     HOMICIDE  FIRST DEGREE MURDER        PARK PROPERTY   False   831   
3     HOMICIDE  FIRST DEGREE MURDER           RESTAURANT   False  1524   
4     HOMICIDE  FIRST DEGREE MURDER               STREET    True  1034   

   District  Ward  Community Area FBI Code  
0       3.0   6.0            69.0      01A  
1      15.0  24.0            25.0      01A  
2       8.0  18.0            66.0      01A  
3      15.0  37.0            25.0      01A  
4      10.0  25.0            31.0      01A  
0         False
1         False
2         False
3         False
4         False
          ...  
638186     True
638187     True
638188     True
638189     True
638190     True
Name: Domestic, Length: 637761, dtype: bool


In [18]:
# encode the data
oe = OrdinalEncoder()
oe.fit(X)
X_enc = oe.transform(X)

le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)


In [19]:
# Feature Selection
selector = SelectKBest (chi2, k=5)
newX = selector.fit_transform(X_enc, y_enc)

cols = selector.get_support(indices = True)
print(cols)
print(X.iloc[:,cols])

[0 1 2 4 8]
         Primary Type                  Description     Location Description  \
0            HOMICIDE          FIRST DEGREE MURDER                    ALLEY   
1            HOMICIDE          FIRST DEGREE MURDER                   STREET   
2            HOMICIDE          FIRST DEGREE MURDER            PARK PROPERTY   
3            HOMICIDE          FIRST DEGREE MURDER               RESTAURANT   
4            HOMICIDE          FIRST DEGREE MURDER                   STREET   
...               ...                          ...                      ...   
638186        BATTERY      DOMESTIC BATTERY SIMPLE                APARTMENT   
638187        ASSAULT          AGGRAVATED: HANDGUN                    ALLEY   
638188        BATTERY      DOMESTIC BATTERY SIMPLE  RESIDENCE PORCH/HALLWAY   
638189  OTHER OFFENSE  VIOLATE ORDER OF PROTECTION   VEHICLE NON-COMMERCIAL   
638190        BATTERY      DOMESTIC BATTERY SIMPLE                APARTMENT   

        Beat FBI Code  
0        323   

In [21]:
X_train, X_test, y_train, y_test = train_test_split(newX, y_enc, test_size = 0.3)

In [23]:
# Logistic Regression
logreg = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced')
logreg.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [25]:
y_pred_log = logreg.predict(X_test)
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log, target_names =['Not Domestic', 'Domestic']))


[[112207  38427]
 [ 10929  29766]]
              precision    recall  f1-score   support

Not Domestic       0.91      0.74      0.82    150634
    Domestic       0.44      0.73      0.55     40695

    accuracy                           0.74    191329
   macro avg       0.67      0.74      0.68    191329
weighted avg       0.81      0.74      0.76    191329



In [None]:
scv = StratifiedKFold(n_splits = 5)
crosspred = cross_val_predict(logreg, newX, y_enc, cv= scv)
print(confustion_matrix(y_test,crosspred))
print(classification_report(y_test,cross_pred))

In [30]:
#Random Forest
randfor = RandomForestClassifier(n_estimators=10, class_weight= 'balanced', verbose= True)


In [32]:
randfor.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.9s finished


RandomForestClassifier(class_weight='balanced', n_estimators=10, verbose=True)

In [33]:
y_pred_rf = randfor.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished


In [34]:
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names =['Not Domestic', 'Domestic']))


[[134839  15795]
 [  5798  34897]]
              precision    recall  f1-score   support

Not Domestic       0.96      0.90      0.93    150634
    Domestic       0.69      0.86      0.76     40695

    accuracy                           0.89    191329
   macro avg       0.82      0.88      0.84    191329
weighted avg       0.90      0.89      0.89    191329



In [35]:
nb = GaussianNB()
sweight = compute_sample_weight(class_weight = 'balanced', y= y_train)
nb.fit(X_train, y_train, sweight)


GaussianNB()

In [36]:
y_pred = nb.predict(X_test)


In [37]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names =['Not Domestic', 'Domestic']))


[[106157  44477]
 [ 10470  30225]]
              precision    recall  f1-score   support

Not Domestic       0.91      0.70      0.79    150634
    Domestic       0.40      0.74      0.52     40695

    accuracy                           0.71    191329
   macro avg       0.66      0.72      0.66    191329
weighted avg       0.80      0.71      0.74    191329

