#### Run full notebook as-is for initial model results

In [3]:
import requests
import pymongo
from flask import Flask, render_template, jsonify
from bson.json_util import dumps
from config import USER, PASSWORD
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [4]:
# Connection string for the hosted MongoDB cluster; USER and PASSWORD are
# imported from the local config module rather than written into the notebook.
conn = f'mongodb+srv://{USER}:{PASSWORD}@weatherviz-andy-5dubo.mongodb.net/gtds_p3?retryWrites=true'
client = pymongo.MongoClient(conn)
# Handle to the gtds_p3 database, used by the cells below.
db = client['gtds_p3']

In [3]:
# household_total = 0
# for i in range(1993,2018):
#     url = f' https://api.bjs.ojp.gov/bjs/ncvs/v2/household/{i}?format=json'
#     res = requests.get(url).json()
#     data = res['householdData']
#     household_total += len(res['householdData'])
#     print(f"year: {i}, records: {len(res['householdData'])}")
#     db.household.insert_many(data)
    
# print(f'total records: {household_total}')

In [5]:
from pprint import pprint
# Fetch every document in the `population` collection and materialise the
# cursor into a DataFrame (one row per document).
records = db['population'].find()
df = pd.DataFrame(data=list(records))

In [6]:
# Source column name -> short snake_case name used by the rest of the notebook.
# Entries are listed in the column order the trimmed frame should have.
column_renames = {
    'Age': 'age',
    'Household income': 'hh_income',
    'Injury': 'injury',
    'Location of incident': 'incident_loc',
    'Location of residence': 'residence_loc',
    'Marital status': 'marital_status',
    'Population size': 'pop_size',
    'Race': 'race',
    'Region': 'region',
    'Reporting to the police': 'police_report',
    'Sex': 'sex',
    'Type of crime': 'type_crime',
    'Victim-offender relationship': 'vo_relationship',
    'Weapon category': 'tw',
}

# Select the columns by label instead of by position. Positional indices only
# pick the right columns when the frame happens to be built with one specific
# column order, which depends on the pandas version and on document key order.
# A missing column now raises a KeyError here instead of silently selecting
# the wrong data. 'year' is kept under its original name.
df_trimmed = df.loc[:, [*column_renames, 'year']]

df_renamed = df_trimmed.rename(columns=column_renames)
# Non-null count per column, as a quick completeness check.
df_renamed.count()

age                43200
hh_income          43200
injury             43200
incident_loc       43200
residence_loc      43200
marital_status     43200
pop_size           43200
race               43200
region             43200
police_report      43200
sex                43200
type_crime         43200
vo_relationship    43200
tw                 43200
year               43200
dtype: int64

In [7]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
from sklearn import tree

In [8]:
# df_renamed.head()

In [10]:
# X = df_renamed.drop(columns=['sex'], axis=1)
# y = df_renamed['sex']
# print(X.shape, y.shape)

# Reduced frame: only the columns used by the models below.
model_columns = ['sex', 'age', 'type_crime', 'vo_relationship']
df_sm = df_renamed.loc[:, model_columns]

In [11]:
# Separate the column to predict (y) from the predictor columns (X).
target = 'type_crime'
y = df_sm[target]
X = df_sm.drop(columns=target)
print(X.shape, y.shape)

(43200, 3) (43200,)


In [12]:
# One-hot encode the predictor columns into indicator (dummy) columns.
X_categorical = pd.get_dummies(data=X)
# X_categorical = X_categorical.drop(columns=['year'], axis=1)

In [13]:
# Normalise the dummy-column names: lower-case them, turn spaces into
# underscores, and strip commas, dollar signs and apostrophes.
name_cleanup = str.maketrans({' ': '_', ',': None, '$': None, "'": None})
new_cols = [col.lower().translate(name_cleanup) for col in X_categorical.columns]

X_categorical.columns = new_cols
# Kept separately so feature importances can be labelled later.
feature_names = X_categorical.columns
X_categorical.columns

Index(['age_12_to_14', 'age_15_to_17', 'age_18_to_20', 'age_21_to_24',
       'age_25_to_34', 'age_35_to_49', 'age_50_to_64', 'age_65_or_older',
       'type_crime_aggravated_assault', 'type_crime_personal_theft',
       'type_crime_rape/sexual_assault', 'type_crime_robbery',
       'type_crime_simple_assault',
       'vo_relationship_do_not_know_number_of_offenders',
       'vo_relationship_do_not_know_relationship', 'vo_relationship_intimates',
       'vo_relationship_other_relatives', 'vo_relationship_stranger',
       'vo_relationship_well-known/casual_acquaintances'],
      dtype='object')

In [14]:
from sklearn.model_selection import train_test_split

# Hold out a test split (size left at the library default), stratified on the
# target so both splits keep the same class proportions; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_categorical,
    y,
    stratify=y,
    random_state=1,
)

In [15]:
# Baseline model: a single decision tree on the one-hot features.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes candidate features at each split, so without a seed the reported
# score can change from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=1)
clf = clf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)

0.6444444444444445

In [16]:
from sklearn.ensemble import RandomForestClassifier
# Ensemble of 200 trees on the same features.
# random_state is pinned so the bootstrap samples and per-split feature
# subsets -- and therefore the score and the feature importances read from
# this model -- are the same on every run.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf = rf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
rf.score(X_test, y_test)

0.6451851851851852

In [None]:
# Pair each importance with its feature name and list the most important first.
importance_pairs = zip(rf.feature_importances_, feature_names)
results = sorted(importance_pairs, reverse=True)
pd.DataFrame(results)

Unnamed: 0,0,1
0,0.328436,vo_relationship_intimates
1,0.167652,vo_relationship_stranger
2,0.158599,type_crime_rape/sexual_assault
3,0.045264,vo_relationship_well-known/casual_acquaintances
4,0.044587,vo_relationship_other_relatives
5,0.030426,type_crime_personal_theft
6,0.027947,age_12_to_14
7,0.027527,type_crime_aggravated_assault
8,0.027108,type_crime_simple_assault
9,0.021713,vo_relationship_do_not_know_relationship


### Neural Network

In [13]:
# Standardise the features using statistics learned from the training split
# only, then apply the same transform to the test split.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: Label-encode the target (encoder fitted on the training labels)
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding.
# num_classes is pinned to the number of classes the encoder learned; left
# unset, to_categorical infers the width from the largest code in each array,
# so the train and test matrices could end up with different widths.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

In [14]:
# Start from an empty Sequential model; layers are added in the next cell.
model = Sequential()

In [15]:
# Take the layer sizes from the prepared arrays instead of hard-coding them.
# A fixed input_dim / output width only works for one particular feature set
# and target, and raises a shape error at fit time as soon as either changes.
n_features = X_train_scaled.shape[1]
n_outputs = y_train_categorical.shape[1]

# Nine fully connected ReLU layers of 100 units; only the first one declares
# the input width.
model.add(Dense(units=100, activation='relu', input_dim=n_features))
for _ in range(8):
    model.add(Dense(units=100, activation='relu'))

# Softmax output: one unit per one-hot column of the target.
model.add(Dense(units=n_outputs, activation='softmax'))

In [None]:
# Configure training (Adam optimiser, categorical cross-entropy loss, accuracy
# metric), then train on the scaled training split for 100 epochs.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
)

Epoch 1/100
 - 3s - loss: 0.6113 - acc: 0.6683
Epoch 2/100
 - 3s - loss: 0.5933 - acc: 0.6848
Epoch 3/100
 - 3s - loss: 0.5840 - acc: 0.6935
Epoch 4/100
 - 3s - loss: 0.5735 - acc: 0.7009
Epoch 5/100
 - 3s - loss: 0.5634 - acc: 0.7106
Epoch 6/100
 - 3s - loss: 0.5527 - acc: 0.7178
Epoch 7/100
 - 3s - loss: 0.5403 - acc: 0.7273
Epoch 8/100
 - 3s - loss: 0.5279 - acc: 0.7369
Epoch 9/100
 - 3s - loss: 0.5106 - acc: 0.7468
Epoch 10/100
 - 3s - loss: 0.4973 - acc: 0.7574
Epoch 11/100
 - 3s - loss: 0.4813 - acc: 0.7650
Epoch 12/100
 - 3s - loss: 0.4651 - acc: 0.7760
Epoch 13/100
 - 3s - loss: 0.4490 - acc: 0.7863
Epoch 14/100
 - 3s - loss: 0.4338 - acc: 0.7960
Epoch 15/100
 - 3s - loss: 0.4190 - acc: 0.8000
Epoch 16/100
 - 3s - loss: 0.4049 - acc: 0.8080
Epoch 17/100
 - 3s - loss: 0.3867 - acc: 0.8175
Epoch 18/100
 - 3s - loss: 0.3758 - acc: 0.8225
Epoch 19/100
 - 3s - loss: 0.3601 - acc: 0.8312
Epoch 20/100
 - 3s - loss: 0.3500 - acc: 0.8366
Epoch 21/100
 - 3s - loss: 0.3358 - acc: 0.8429
E

In [None]:
# Score the trained network on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Predict the first 100 test rows. Sequential.predict_classes() is no longer
# available in current TensorFlow/Keras; for a softmax output it returned the
# index of the largest probability, which is what argmax computes here.
class_probabilities = model.predict(X_test_scaled[:100])
encoded_predictions = np.argmax(class_probabilities, axis=-1)
# Map the integer codes back to the original class labels.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test[:100])}")
# Predicted label next to the true label for the first 100 test rows.
pd.DataFrame({
    'predicted': prediction_labels,
    'actual': y_test.iloc[:100].tolist(),
})