# Boston Traffic Stop Analysis
By Andrew Simmons & Ethan Smith

In [None]:
from operator import itemgetter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
"""Load data and filter"""

# Load the raw FIO export and turn its missing-value sentinels into NaN.
dataset = pd.read_csv('data/boston-police-department-fio.csv')

# The literal string 'NO DATA ENTERED' is treated as a missing value everywhere.
dataset.replace('NO DATA ENTERED', np.nan, inplace=True)

# A vehicle year of 0 is treated as missing.
dataset['VEH_YEAR_NUM'] = dataset['VEH_YEAR_NUM'].replace(0, np.nan)

# Ages of 0 and -1 are treated as missing. This must be replace(), not map():
# Series.map with a dict converts every value that is NOT one of the keys to
# NaN, which blanked out the entire age column.
# NOTE(review): if this column was parsed as strings the integer keys will not
# match anything -- confirm the dtype after loading.
dataset['AGE_AT_FIO_CORRECTED'] = dataset['AGE_AT_FIO_CORRECTED'].replace({0: np.nan, -1: np.nan})

In [None]:
# Show the column labels of the loaded DataFrame.
dataset.columns

## Data Visualization

In [None]:
"""Sex Distribution"""

# Count rows per recorded sex; rows whose SEX is neither 'MALE' nor 'FEMALE'
# (including missing values) are not part of the chart.
is_male = dataset['SEX'] == 'MALE'
is_female = dataset['SEX'] == 'FEMALE'
males = int(is_male.sum())
females = int(is_female.sum())

sexes = ['Male', 'Female']
counts = np.array([males, females])

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

# Pull the second wedge ('Female') slightly out of the pie.
sex_distribution = ax.pie(
    counts,
    labels=sexes,
    explode=[0, 0.15],
    autopct="%1.1f%%",
    shadow=True,
)

ax.set_title('Sex Distribution')

In [None]:
"""Prior Offenses"""

# Count rows by PRIORS flag; rows whose value is neither 'YES' nor 'NO'
# (including missing values) are not part of the chart.
has_prior_offenses = int((dataset['PRIORS'] == 'YES').sum())
no_prior_offenses = int((dataset['PRIORS'] == 'NO').sum())

labels = ['Yes', 'No']
counts = [has_prior_offenses, no_prior_offenses]

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

# The pie handle is named for what it holds (it used to be bound to a
# copy-pasted name belonging to a different chart).
priors_distribution = ax.pie(
    counts,
    explode=[0, 0.15],  # pull the 'No' wedge slightly out
    shadow=True,
    labels=labels,
    autopct="%1.1f%%",
)

ax.set_title('Prior Offenses Distribution')

In [None]:
"""Terrorism Distribution"""

# Count rows by TERRORISM flag; rows whose value is neither 'YES' nor 'NO'
# (including missing values) are not part of the chart.
terrorism = int((dataset['TERRORISM'] == 'YES').sum())
no_terrorism = int((dataset['TERRORISM'] == 'NO').sum())

labels = ['Terrorism', 'Non-Terrorism']
counts = [terrorism, no_terrorism]

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

# The pie handle is named for what it holds (it used to be bound to a
# copy-pasted name belonging to a different chart).
terrorism_distribution = ax.pie(
    counts,
    shadow=True,
    labels=labels,
    autopct="%1.1f%%",
)

ax.set_title('Terrorism Distribution')

In [None]:
"""Vehicle Make Distribution"""

# Drop rows with no recorded vehicle make.
without_null_vehicle_makes = dataset[dataset['VEH_MAKE'].notna()]

# value_counts() tallies every make in one pass and returns them ordered from
# most to least frequent, replacing a loop that re-filtered the whole frame
# once per make. It also copes with an empty frame, where unpacking
# zip(*[]) used to raise. The order of makes with equal counts may differ
# from the previous sort.
make_frequencies = without_null_vehicle_makes['VEH_MAKE'].value_counts()
makes = tuple(make_frequencies.index)
counts = tuple(make_frequencies.tolist())

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

vehicle_make_bars = ax.bar(makes, counts)

ax.set_title('Vehicle Make Distribution')

# Rotate the x tick labels so the make names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
"""Race Distribution"""

# Drop rows with no recorded race.
without_null_race = dataset[dataset['RACE_DESC'].notna()]

# value_counts() tallies every race in one pass and returns them ordered from
# most to least frequent, replacing a loop that re-filtered the whole frame
# once per race. It also copes with an empty frame, where unpacking
# zip(*[]) used to raise. The order of races with equal counts may differ
# from the previous sort.
race_frequencies = without_null_race['RACE_DESC'].value_counts()
races = tuple(race_frequencies.index)
counts = tuple(race_frequencies.tolist())

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

race_bars = ax.bar(races, counts)

ax.set_title('Race Distribution')

# Rotate the x tick labels so the race names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
"""Race Distribution Without Priors"""

# Keep only rows with a recorded race and PRIORS == 'NO'.
without_null_race = dataset[dataset['RACE_DESC'].notna()]
without_priors = without_null_race[without_null_race['PRIORS'] == 'NO']

# value_counts() tallies every race in one pass and returns them ordered from
# most to least frequent, replacing a loop that re-filtered the whole frame
# once per race. It also copes with an empty selection, where unpacking
# zip(*[]) used to raise. The order of races with equal counts may differ
# from the previous sort.
race_frequencies = without_priors['RACE_DESC'].value_counts()
races = tuple(race_frequencies.index)
counts = tuple(race_frequencies.tolist())

# 10x10 inch figure, sized at creation.
fig, ax = plt.subplots(figsize=(10, 10))

race_without_priors_bars = ax.bar(races, counts)

ax.set_title('Race Distribution Without Priors')

# Rotate the x tick labels so the race names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
"""Decision Tree"""

# Presumably the target column for the classifier sketched in the
# commented-out code below; it is not referenced anywhere in this cell yet.
label_feature = 'PRIORS'

# Columns kept for the decision-tree experiment. Note that 'PRIORS' itself is
# part of this selection.
feature_columns = [
    'SEX',
    'PRIORS',
    'COMPLEXION',
    'TERRORISM',
    'BASIS',
    'STOP_REASONS',
    'FIOFS_REASONS',
    'VEH_MAKE',
    'VEH_YEAR_NUM',
    'VEH_COLOR',
    'VEH_OCCUPANT',
    'VEH_STATE',
    'RACE_DESC',
    'AGE_AT_FIO_CORRECTED',
    'CITY',
]
important_features = dataset[feature_columns]

# TODO: Remove Debug
# Dump the selected columns to disk (without the index) for manual inspection.
important_features.to_csv('cleaned_decision_tree_features.csv', index=False)




# TODO: Consider the following features:
#     FIOFS_TYPE: Figure out what these mean
#     SEARCH: Figure out what these mean
#     OUTCOME: Figure out what these mean



# subset = dataset[['RACE_DESC', 'SEX', 'PRIORS']]
# features = subset[['RACE_DESC', 'SEX']]
# labels = subset['PRIORS']

# print(type(features))
# print(features)
# print(features.iloc[:, 1])

# label_encoder = preprocessing.LabelEncoder()
# for i in range(features.shape[1]):
#     features.iloc[:, i] = label_encoder.fit_transform(features.iloc[:, i])
#     print(features)

# X_train, X_test, y_train, y_test = train_test_split(subset[['RACE_DESC', 'SEX']],
#                                                     subset[['PRIORS']],
#                                                     test_size=0.33,
#                                                     random_state=42)


# model = tree.DecisionTreeClassifier()
# model.fit(X_train, y_train)
# pred = model.predict(X_test)

# accuracy_score(y_test, pred)