# Goal: to build a model that predicts if a weapon was used or not based on the attributes of that crime

This will use a binary outcome of true or false. I will try using bayesian model, logistic regression, random forrest.

In [1]:
import pandas as pd
import numpy as np
import pyspark
import numpy
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix



%matplotlib inline

In [2]:
df = pd.read_csv('clean_crime_data.csv')

I am not sure how many crime involved the use of a weapon. Lets look:

In [3]:
len(df[df.firearm_used_flag >=1]) # this feels like a cumbersome approach. Lets do something cool

401

In [4]:
spark = pyspark.sql.SparkSession.builder.appName('pandasToSparkDF').getOrCreate()

# create spark dataframes
crime_df = spark.createDataFrame(df)

crime_df.createOrReplaceTempView('crime')

print('crimes where weapon was used')

gun_crimes = spark.sql("""
select 
    count(distinct crime_id) as crime_count,
    description
from crime
where firearm_used_flag >= 1
and description not LIKE '%Weapons%'
group by 2 order by 1 desc
""")

gun_crimes.show()

print('all crimes')

all_crimes = spark.sql("""
select
count(distinct crime_id) as crime_count
from crime
""")

all_crimes.show()

#print('Weapons were used in {}% of the crimes in this data set')

crimes where weapon was used
+-----------+--------------------+
|crime_count|         description|
+-----------+--------------------+
|         93|  Aggravated Assault|
|         55|Aggravated Assaul...|
|          8|Non Aggravated As...|
|          6|       Armed Robbery|
|          5|Non Aggravated As...|
|          2|  Strong Arm Robbery|
|          1|                Rape|
|          1|Kidnapping/Abduction|
|          1|            Homocide|
+-----------+--------------------+

all crimes
+-----------+
|crime_count|
+-----------+
|      30400|
+-----------+



In [5]:
print('Weapons were used in {}% of the crimes in this data set'.format(round((gun_crimes.groupBy().sum().collect()[0][0]/
                                                                            all_crimes.groupBy().sum().collect()[0][0]),3)*100))

Weapons were used in 0.6% of the crimes in this data set


Ok. This might seem pretty bad but actually there are a ton of types of crimes that we can exclude to narrow our focus and give this percentage a bit more of a fighting chance! 

In [6]:
gc = gun_crimes.toPandas()
gc.drop([6], axis= 0, inplace = True)

data = df[df.description.isin(gc.description.unique())]
data.reset_index(inplace = True,drop = True)
data = data.drop_duplicates(subset=['crime_id'], keep = False) 
data.firearm_used_flag = np.where(data.firearm_used_flag >= 1,1,0)
data.dvflag = np.where(data.dvflag >= 1,1,0)

In [7]:
print('now we have {}% of the crimes in this data set involving a firearm'.format(round((len(data[data.firearm_used_flag>=1])/
                                                                                 len(data[data.firearm_used_flag<1]))*100,2)))

now we have 2.37% of the crimes in this data set involving a firearm


this will be much better! 

# Naieve Bayes Classifer 

I will be using the Complement Naive Bayes (CNB) algorithm. CNB is an adaptation of the standard multinomial naive Bayes (MNB) algorithm that is particularly suited for imbalanced data sets. Given that we are trying to predict an event that only occurs ~2% of the time, this is a good choice.

https://www.youtube.com/watch?v=CPqOCI0ahss

This is a really good video of explaining how a Naieve Bayes model works at a high level. Its really pretty simple.

In [8]:
from sklearn.naive_bayes import ComplementNB

# split the data, will use this same data for other models 
model_df = data.loc[:,'dvflag':]
model_df_data = model_df.drop(columns=['firearm_used_flag'])
X_train, X_test, y_train, y_test = train_test_split(model_df_data,model_df['firearm_used_flag'],test_size = .15,
                                                    random_state = 42)
# train the model
model = ComplementNB().fit(X_train, y_train)
predicted = model.predict(X_test)

# put results to a confusion matrix
nb_results = pd.DataFrame(confusion_matrix(y_test, predicted), columns=['pred_no_gun','pred_gun'],
             index = ['actual_no_gun','actual_gun'])
nb_results

Unnamed: 0,pred_no_gun,pred_gun
actual_no_gun,715,368
actual_gun,12,13


###### Ok, this model is better than flipping a coin, but it is not that good. Lets break it down some:

In [9]:
def modelStats(results):
    accuracy = ((results.loc['actual_no_gun','pred_no_gun'] +results.loc['actual_gun','pred_gun'])/results.values.sum())*100
    mis_class = ((results.loc['actual_gun','pred_no_gun'] +results.loc['actual_no_gun','pred_gun'])/results.values.sum())*100
    true_pos = ((results.loc['actual_gun','pred_gun']/results.loc['actual_gun'].sum()))*100
    false_pos = ((results.loc['actual_no_gun','pred_gun']/results.loc['actual_no_gun'].sum()))*100
    true_neg = ((results.loc['actual_no_gun','pred_no_gun']/results.pred_no_gun.sum()))*100
    precision = ((results.loc['actual_gun','pred_gun']/results.pred_gun.sum()))*100
    prevalence = (results.loc['actual_gun'].sum()/results.values.sum())*100

    print('The model was {}% accuracte'.format(round(accuracy,2)))
    print('The model had a misclassification rate of {}%'.format(round(mis_class,2)))
    print('The model had a true positive rate of {}%'.format(round(true_pos,2)))
    print('The model had a false positive rate of {}%'.format(round(false_pos,2)))
    print('The model had a true negitive rate of {}%'.format(round(true_neg,2)))
    print('The model had a precision rate of {}%'.format(round(precision,2)))
    print('The model had a prevalence rate of {}%'.format(round(prevalence,2)))

In [10]:
modelStats(nb_results)

The model was 65.7% accuracte
The model had a misclassification rate of 34.3%
The model had a true positive rate of 52.0%
The model had a false positive rate of 33.98%
The model had a true negitive rate of 98.35%
The model had a precision rate of 3.41%
The model had a prevalence rate of 2.26%
