In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import random
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

In [None]:
# load data
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv is the
# supported reader. index_col=False tells pandas not to use any column as the index,
# so both frames keep the default integer row index.
train_set = pd.read_csv('sqf_train_cpw.csv', index_col=False)
test_set = pd.read_csv('sqf_test_cpw.csv', index_col=False)


In [None]:
# join and re-split data to get all categories
# Tag every row with its origin so the two sets can be separated again later;
# encoding them together gives train and test the same dummy columns.
train_set['set'] = 'train'
test_set['set'] = 'test'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row labels are kept as-is (no ignore_index), matching what append did.
joined_data = pd.concat([train_set, test_set])

# select all non-real-valued columns (besides 'set' and 'id') and convert to one-hot encoding
non_categorical = ['id', 'set', 'suspect.age', 'suspect.weight', 'suspect.height', 'observation.period']
col_names = joined_data.columns.difference(non_categorical)
joined_data = pd.get_dummies(data=joined_data, columns=col_names, sparse=True)


In [None]:
# remove redundant columns (binary columns of the form 'variable_False')
# Match on the '_False' suffix produced by get_dummies rather than on any occurrence
# of "False", so a column whose name merely contains that text is not dropped.
redundant_cols = [name for name in joined_data.columns if name.endswith('_False')]
# Rebind the name instead of mutating the frame in place.
joined_data = joined_data.drop(redundant_cols, axis=1)


In [None]:
# split data again
# Use the 'set' marker to recover each partition, then discard the marker column.
train = joined_data[joined_data['set'] == 'train'].drop('set', axis=1)
test = joined_data[joined_data['set'] == 'test'].drop('set', axis=1)

In [None]:
# split training data into features and outcome (numpy arrays, to feed to sklearn algorithms)
# Outcome: the 'found.weapon_True' column flattened to a 1-D array.
label_train = train[['found.weapon_True']].values.ravel()
# Features: everything except the row id and the three outcome-related columns.
pred_train = train.drop(
    ['id', 'arrested_True', 'found.weapon_True', 'found.gun_True'],
    axis=1,
).values


In [None]:
# format test data
# 'results' is a copy of the test frame, so columns can be added to it
# without touching 'test' itself.
results = test.copy()
label_test = test[['found.weapon_True']].values.ravel()
# Drop the same id/outcome columns as for the training features.
pred_test = test.drop(
    ['id', 'found.weapon_True', 'arrested_True', 'found.gun_True'],
    axis=1,
)
# Record the column names before converting to a bare numpy array.
feature_names = pred_test.columns.tolist()
pred_test = pred_test.values

In [None]:
# fit an L1 penalized logistic regression model
# The L1 penalty can shrink coefficients to exactly zero; liblinear is a solver
# that supports that penalty.
logit_classifier = LogisticRegression(penalty="l1", solver='liblinear', verbose=2)
logit_classifier.fit(X=pred_train, y=label_train)

# Take the second predict_proba column as the score for the 'found.weapon_True' outcome
# and store it next to the test rows.
logit_predictions = logit_classifier.predict_proba(pred_test)[:, 1]
results['preds'] = logit_predictions
# print(...) with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(roc_auc_score(label_test, logit_predictions))

# Accuracy of the hard class predictions on the test set.
logit_predictions_class = logit_classifier.predict(pred_test)
print(accuracy_score(label_test, logit_predictions_class, normalize=True))

In [None]:
# for different threshold values, select the features which remain in the model
# prefit=True reuses the already-fitted classifier instead of refitting it;
# change 'threshold' to explore how many features survive.
model = SelectFromModel(estimator=logit_classifier, threshold=0.5, prefit=True)
pred_test_new = model.transform(pred_test)

# Handy checks while experimenting with the threshold:
#pred_test.shape
#pred_test_new.shape
#print(model.get_support())


In [None]:
# get_support() yields one boolean per feature: True when the selector kept it.
feature_importances = list(model.get_support())
# Pair every feature name with its keep/drop flag, position by position.
feature_list = [
    (name, feature_importances[position])
    for position, name in enumerate(feature_names)
]

# get the names of the features from the previous cell
#print(sorted(feature_list, reverse=True, key=lambda x: x[1]))

In [None]:
# train a new model based just on features selected above
#predictors = ['location.housing_housing', 'location.housing_neither', 'stopped.bc.object_True', 'stopped.bc.bulge_True', 'additional.sights_True']
#pred_train = train[predictors]
#pred_train = pred_train.values 
#pred_test = test[predictors]
#feature_names = list(pred_test.columns.values)
#pred_test = pred_test.values 


# How does AUC change as you add/subtract features?

#logit_classifier_simple = LogisticRegression(solver='liblinear', verbose=2)
#logit_classifier_simple.fit(X=pred_train, y=label_train)

#logit_predictions_simple = logit_classifier_simple.predict_proba(pred_test)[:, 1]
#results['preds_simple'] = logit_predictions_simple
#print(roc_auc_score(label_test, logit_predictions_simple))

#print(logit_classifier_simple.coef_)



In [None]:
# add a column which contains the unit weighted heuristic score derived from the features from the previous cell

In [None]:
# Plotting question:
# Make a recovery plot: if you used the logistic model to rank stops by model-predicted likelihood of weapon recovery, 
# from highest to lowest, what percent of weapons would you recover if you made the best x percent of stops?
# The plot should have percent of stops on the x axis and percent weapons recovered on the y axis

# HINTS:
# 1) order results by column 'preds'
#results = results.sort_values(['preds'], ascending=False)
# 2) add a column to results which is the cumulative sum of found.weapon_True
#plot_data = results[['found.weapon_True', 'preds']]
#plot_data['weap_sum'] = plot_data['found.weapon_True'].cumsum()
# 3) use the above cumulative sum to make a column which shows percent weapons recovered
#plot_data['weap_perc'] = 100*plot_data['weap_sum']/plot_data['found.weapon_True'].sum()
# 4) add a column which counts the stops
#s = list(range(1, len(plot_data) + 1))
#plot_data['nstop'] = s
# 5) use the above stop count column to make a column which shows percent of all stops
#plot_data['stop_perc'] = 100*plot_data['nstop']/plot_data.shape[0]
# 6) restrict to just the columns from 3) and 5), downsample to maybe 1000 rows
#plot_data = plot_data[['stop_perc', 'weap_perc']]
#rows = random.sample(list(plot_data.index), 1000)
#plot_data = plot_data.loc[rows]
# 7) sort everything in ascending order by the column from 5), then plot.
#plot_data = plot_data.sort_values(['stop_perc'], ascending=True)
#plt.figure()
#plot_data.plot(x='stop_perc', y='weap_perc')



###### How does the heuristic model perform?


In [None]:
# Scratch cell, intentionally left without code: a bare undefined name here
# would raise NameError when the notebook is run top to bottom.