# Feature Importance

In [None]:
import requests
import pymongo
from flask import Flask, render_template, jsonify
from bson.json_util import dumps
from config import USER, PASSWORD
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [None]:
conn = f'mongodb+srv://{USER}:{PASSWORD}@weatherviz-andy-5dubo.mongodb.net/gtds_p3?retryWrites=true'
client = pymongo.MongoClient(conn)
db = client.gtds_p3

In [None]:
from pprint import pprint
records = db.population.find()
df = pd.DataFrame(list(records))

In [None]:
df_trimmed = df.iloc[:, [0,3,4,5,6,7,9,11,13,14,16,17,19,20,44]]

df_renamed = df_trimmed.rename(columns={
    'Age': 'age',
    'Household income': 'hh_income',
    'Location of incident':'incident_loc',
    'Injury':'injury',
    'Location of residence':'residence_loc',
    'Marital status':'marital_status',
    'Population size':'pop_size',
    'Race':'race',
    'Region':'region',
    'Reporting to the police':'police_report',
    'Sex':'sex',
    'Type of crime':'type_crime',
    'Victim-offender relationship':'vo_relationship',
    'Weapon category':'tw'
})
df_renamed.count()

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
from sklearn import tree
from sklearn import preprocessing

In [None]:
df_sm = df_renamed[['sex','age','type_crime','vo_relationship']]
le = preprocessing.LabelEncoder()

le.fit(df_sm['vo_relationship'])
test = le.fit_transform(df_sm['vo_relationship'])
df_sm['vo_relationship_num'] = list(test)

le.fit(df_sm['age'])
test = le.fit_transform(df_sm['age'])
df_sm['age_num'] = list(test)

le.fit(df_sm['type_crime'])
test = le.fit_transform(df_sm['type_crime'])
df_sm['type_crime_num'] = list(test)

le.fit(df_sm['sex'])
test = le.fit_transform(df_sm['sex'])
df_sm['sex_num'] = list(test)

In [None]:
df_sm.head()

In [None]:
df_categorical = df_sm.iloc[:,[4,5,6,7]]
X = df_categorical.drop(columns=['age_num'], axis=1)
y = df_categorical['age_num']
feature_names = X.columns
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
results = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
df_to_send = pd.DataFrame(results)
display(df_to_send)
# df_to_send.to_csv('predict_crimetype.csv', index=False)

### Neural Network

In [None]:
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
model = Sequential()

In [None]:
model.add(Dense(units=100, activation='relu', input_dim=64))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile and fit the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

In [None]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
encoded_predictions = model.predict_classes(X_test_scaled[:100])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
# print(f"Predicted classes: {prediction_labels}")
# print(f"Actual Labels: {list(y_test[:100])}")
pd.DataFrame({'predicted':prediction_labels, 'actual':list(y_test[:100])})

### Cross validation

In [1]:
import requests
import pymongo
import pandas
import numpy
import pandas as pd
import tensorflow as tf
from pprint import pprint

from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import tree, preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

from config import USER, PASSWORD

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def create_baseline():
	# create model
	model = Sequential()
	model.add(Dense(11, input_dim=11, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
	return model

In [14]:
def create_smaller():
	# create model
	model = Sequential()
	model.add(Dense(6, input_dim=11, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [4]:
conn = f'mongodb+srv://{USER}:{PASSWORD}@weatherviz-andy-5dubo.mongodb.net/gtds_p3?retryWrites=true'
client = pymongo.MongoClient(conn)
db = client.gtds_p3

In [5]:
seed = 7
numpy.random.seed(seed)
raw_data = db.raw_data.find({}, {'_id': False})
dfv2 = pd.DataFrame(list(raw_data))
dfv2 = dfv2.dropna()
dfv2.count()

In [10]:
dataset = dfv2.values
# split into input (X) and output (Y) variables
X = dfv2.drop(columns=['type_crime'], axis=1)
y = dfv2['type_crime']
X.shape, y.shape

((54836, 11), (54836,))

In [11]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)
y_encoded.shape

(54836,)

In [12]:
# # create_baseline() with standardized dataset
# numpy.random.seed(seed)
# estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)))
# pipeline = Pipeline(estimators)
# kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
# results = cross_val_score(pipeline, X, y_encoded, cv=kfold)
# print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [13]:
# create_smaller() with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_smaller, epochs=10, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, y_encoded, cv=kfold)
print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 86.34% (0.40%)


In [27]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = dfv2 #pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('type_crime', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['type_crime'].values, random_state=42)

# Average CV score on the training set was:0.7052474033424099
exported_pipeline = make_pipeline(
    OneHotEncoder(minimum_fraction=0.2, sparse=False, threshold=10),
    ExtraTreesClassifier(bootstrap=True, criterion="entropy", max_features=0.05, min_samples_leaf=1, min_samples_split=7, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

In [None]:
encoder.fit(dfv2['vo_relationship'])
test = encoder.fit_transform(dfv2['vo_relationship'])
dfv2['vo_relationship_num'] = list(test)

encoder.fit(dfv2['age'])
test = encoder.fit_transform(dfv2['age'])
dfv2['age_num'] = list(test)

encoder.fit(dfv2['type_crime'])
test = encoder.fit_transform(dfv2['type_crime'])
dfv2['type_crime_num'] = list(test)

encoder.fit(dfv2['sex'])
test = encoder.fit_transform(dfv2['sex'])
dfv2['sex_num'] = list(test)