#### Run full notebook as-is for initial model results

In [35]:
from urllib.parse import quote_plus

import pandas as pd
import pymongo
import requests
from bson.json_util import dumps
from flask import Flask, render_template, jsonify

from config import USER, PASSWORD
# Raise pandas' display limits so frames with up to 999 columns / rows are
# rendered in full instead of being truncated.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [2]:
# Connect to the MongoDB cluster and select the `gtds_p3` database.
# Credentials come from the local `config` module. They are percent-escaped
# because reserved characters in a username/password (e.g. '@', ':', '/')
# would otherwise cause the connection URI to be parsed incorrectly.
# NOTE(review): assumes config.py stores the raw, un-escaped credentials —
# confirm, since already-escaped values would be escaped twice.
conn = (
    f'mongodb+srv://{quote_plus(USER)}:{quote_plus(PASSWORD)}'
    '@weatherviz-andy-5dubo.mongodb.net/gtds_p3?retryWrites=true'
)
client = pymongo.MongoClient(conn)
db = client.gtds_p3

In [3]:
# Disabled ingestion step: for each year 1993-2017, fetch that year's household
# records from the BJS NCVS API, insert them into the `household` collection,
# and print the per-year and total record counts.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not insert duplicate documents — confirm before re-enabling.
# household_total = 0
# for i in range(1993,2018):
#     url = f' https://api.bjs.ojp.gov/bjs/ncvs/v2/household/{i}?format=json'
#     res = requests.get(url).json()
#     data = res['householdData']
#     household_total += len(res['householdData'])
#     print(f"year: {i}, records: {len(res['householdData'])}")
#     db.household.insert_many(data)
    
# print(f'total records: {household_total}')

In [4]:
from pprint import pprint

# Read every document in the `population` collection and materialise the
# cursor into a DataFrame (one row per document).
records = db['population'].find()
df = pd.DataFrame(data=list(records))

In [37]:
# Keep only the columns used for modelling and give them short snake_case names.
#
# Columns are selected by NAME rather than by position: the positional
# `iloc` indices previously used here depend on the column order pandas
# derives from the Mongo documents, so a different pandas version or a new
# field in the collection would silently select the wrong columns. Selecting
# by name raises a KeyError instead if an expected column is missing.
#
# The mapping is listed in the same order as the resulting frame's columns
# (the 14 renamed columns followed by the untouched `year` column).
COLUMN_RENAMES = {
    'Age': 'age',
    'Household income': 'hh_income',
    'Injury': 'injury',
    'Location of incident': 'incident_loc',
    'Location of residence': 'residence_loc',
    'Marital status': 'marital_status',
    'Population size': 'pop_size',
    'Race': 'race',
    'Region': 'region',
    'Reporting to the police': 'police_report',
    'Sex': 'sex',
    'Type of crime': 'type_crime',
    'Victim-offender relationship': 'vo_relationship',
    'Weapon category': 'tw',
}

df_trimmed = df[list(COLUMN_RENAMES) + ['year']]
df_renamed = df_trimmed.rename(columns=COLUMN_RENAMES)

# Non-null count per column, shown as the cell output.
df_renamed.count()

age                43200
hh_income          43200
injury             43200
incident_loc       43200
residence_loc      43200
marital_status     43200
pop_size           43200
race               43200
region             43200
police_report      43200
sex                43200
type_crime         43200
vo_relationship    43200
tw                 43200
year               43200
dtype: int64

In [38]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

In [39]:
# The target is the `sex` column; every other retained column is a feature.
y = df_renamed['sex']
X = df_renamed.drop(columns=['sex'])
print(X.shape, y.shape)

(43200, 14) (43200,)


In [40]:
# One-hot encode the feature frame with pandas' dummy-variable expansion.
X_categorical = pd.get_dummies(data=X)

In [50]:
# Normalise the dummy-column names: lower-case them, turn spaces into
# underscores, and strip the ',', '$' and "'" characters.
_strip_punctuation = str.maketrans('', '', ",$'")
new_cols = [
    col.lower().replace(' ', '_').translate(_strip_punctuation)
    for col in X_categorical.columns
]

X_categorical.columns = new_cols

In [42]:
from sklearn.model_selection import train_test_split

# Hold-out split, stratified on the target so both partitions keep the same
# class balance; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_categorical,
    y,
    stratify=y,
    random_state=1,
)

In [43]:
# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: map the string labels to integer codes (encoder fitted on y_train).
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: expand the integer codes into one-hot rows for the softmax output.
y_train_categorical, y_test_categorical = (
    to_categorical(codes) for codes in (encoded_y_train, encoded_y_test)
)

In [44]:
# Create the (initially empty) Keras Sequential model object.
model = Sequential()

In [45]:
# Build the network in a single step so this cell is safe to re-run: calling
# `model.add(...)` on an already-populated model appends further layers every
# time the cell is executed.
# The input width is read from the training matrix instead of being
# hard-coded, so it keeps matching the number of dummy columns if the data
# (and therefore the one-hot expansion) changes.
n_features = X_train_scaled.shape[1]
model = Sequential([
    Dense(units=100, activation='relu', input_dim=n_features),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),  # one unit per one-hot label column
])

In [46]:
# Configure training: Adam optimiser, categorical cross-entropy loss, and
# accuracy reported as the metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the scaled training split for 100 epochs, reshuffling each epoch;
# verbose=2 logs a summary line per epoch.
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=100, shuffle=True, verbose=2)

Epoch 1/100
 - 1s - loss: 0.6091 - acc: 0.6700
Epoch 2/100
 - 1s - loss: 0.5833 - acc: 0.6914
Epoch 3/100
 - 1s - loss: 0.5699 - acc: 0.7020
Epoch 4/100
 - 1s - loss: 0.5585 - acc: 0.7112
Epoch 5/100
 - 1s - loss: 0.5465 - acc: 0.7187
Epoch 6/100
 - 1s - loss: 0.5322 - acc: 0.7300
Epoch 7/100
 - 1s - loss: 0.5179 - acc: 0.7375
Epoch 8/100
 - 1s - loss: 0.5043 - acc: 0.7508
Epoch 9/100
 - 1s - loss: 0.4896 - acc: 0.7590
Epoch 10/100
 - 1s - loss: 0.4747 - acc: 0.7687
Epoch 11/100
 - 1s - loss: 0.4623 - acc: 0.7774
Epoch 12/100
 - 1s - loss: 0.4495 - acc: 0.7836
Epoch 13/100
 - 1s - loss: 0.4378 - acc: 0.7936
Epoch 14/100
 - 1s - loss: 0.4263 - acc: 0.7995
Epoch 15/100
 - 1s - loss: 0.4137 - acc: 0.8054
Epoch 16/100
 - 1s - loss: 0.4039 - acc: 0.8111
Epoch 17/100
 - 1s - loss: 0.3936 - acc: 0.8164
Epoch 18/100
 - 1s - loss: 0.3838 - acc: 0.8225
Epoch 19/100
 - 1s - loss: 0.3755 - acc: 0.8265
Epoch 20/100
 - 1s - loss: 0.3645 - acc: 0.8345
Epoch 21/100
 - 1s - loss: 0.3546 - acc: 0.8387
E

<keras.callbacks.History at 0x1b2a2494b70>

In [47]:
# Score the trained network on the held-out test split.
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 2.19990688306314, Accuracy: 0.649537037037037


In [48]:
# Predict classes for the first 100 test rows.
# `Sequential.predict_classes` is deprecated and absent from newer
# TensorFlow/Keras releases; taking the argmax over the softmax outputs gives
# the same class indices and works across versions.
class_probabilities = model.predict(X_test_scaled[:100])
encoded_predictions = np.argmax(class_probabilities, axis=-1)

# Map the integer class indices back to the original string labels.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

  if diff:


In [57]:
# Side-by-side view of the predicted and true labels for the first 100 test rows.
actual_labels = y_test.iloc[:100].tolist()
pd.DataFrame({'predicted': prediction_labels, 'actual': actual_labels})

Unnamed: 0,predicted,actual
0,Female,Female
1,Male,Female
2,Female,Female
3,Female,Male
4,Female,Female
5,Male,Female
6,Female,Female
7,Female,Male
8,Male,Male
9,Male,Male
