In [1036]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import  psycopg2

from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

In [1037]:
conn = psycopg2.connect(
    host='localhost', 
    dbname='cpdp', 
    user='blaine', 
    password='postgres'
)

In [1201]:
query = """
WITH officer_subset AS (
    SELECT
           o.id,
           o.first_name,
           o.last_name,
           o.birth_year,
           o.appointed_date,
           date_part('year', '2018-01-01'::DATE) - o.birth_year as estimated_age,
           ('2018-01-01'::DATE - o.appointed_date) / 365 as years_on_force,
           COUNT(a.id) as allegation_count
        FROM data_officer o
        LEFT JOIN data_officerallegation a on o.id = a.officer_id
        WHERE active = 'Yes'
            AND appointed_date BETWEEN '2000-01-01' AND '2011-12-31'
        GROUP BY o.id
        ORDER BY years_on_force DESC
)
SELECT allegation.id as allegation_id,
           CASE
               WHEN allegation.crid LIKE 'C%' THEN SUBSTRING(crid, 2)::integer
               ELSE allegation.crid::integer
           END as crid,
           oa.id as officerallegation_id,
           officer.id as officer_id,
           officer.appointed_date as appointed_date,
            allegation.incident_date as incident_date,
           category.category as category,
           (SELECT COUNT(*)
            FROM data_officerallegation
            JOIN data_allegation d on data_officerallegation.allegation_id = d.id
            WHERE officer_id = officer.id
            AND d.incident_date > officer.appointed_date
            AND d.incident_date < (allegation.incident_date + interval '2 year')
            AND disciplined = True
            ) as disciplined,
           (SELECT COUNT(*)
               FROM trr_trr
               WHERE trr_trr.officer_id = officer.id
               AND trr_trr.trr_datetime < (allegation.incident_date + interval '2 year')) as trr_count,
           (SELECT COUNT(*)
            FROM data_officerallegation
            JOIN data_allegation d on data_officerallegation.allegation_id = d.id
            WHERE officer_id = officer.id
            AND d.incident_date > officer.appointed_date
            AND d.incident_date < (allegation.incident_date + interval '2 year')) as total_allegation_count,
           (CASE WHEN (SELECT civilian_allegation_percentile FROM data_officer WHERE data_officer.id = officer.id) > 75.0 THEN 1
            ELSE 0
            END) as percent_repeater_status,
            (CASE WHEN (
                SELECT
                       COUNT(*)
                FROM data_officerallegation
                JOIN data_allegation on data_officerallegation.allegation_id = data_allegation.id
                JOIN data_allegationcategory on data_officerallegation.allegation_category_id = data_allegationcategory.id
                WHERE data_officerallegation.officer_id = oa.officer_id
                AND data_allegation.incident_date < (officer.appointed_date + interval '7 year')
                AND data_allegationcategory.category NOT IN ('Operation/Personnel Violations',
                       'Lockup Procedures',
                       'Traffic',
                       'Supervisory Responsibilities',
                       'Unknown',
                       'Medical')) >= 5 THEN 1 ELSE 0 END) as repeater_status
    FROM cp5_officer_subset officer
    JOIN data_officerallegation oa ON oa.officer_id = officer.id
    JOIN data_allegation allegation on oa.allegation_id = allegation.id
    JOIN data_allegationcategory category on oa.allegation_category_id = category.id
    WHERE allegation.id IN (
        SELECT d.id
        FROM data_officerallegation
        JOIN data_allegation d on data_officerallegation.allegation_id = d.id
        WHERE d.is_officer_complaint = False
        AND officer_id = officer.id
        AND d.incident_date > (officer.appointed_date)
        AND d.incident_date < (officer.appointed_date + interval '2 year')
        ORDER BY d.incident_date LIMIT 1);
"""

data_df = pd.read_sql_query(
    query,
    conn
)

data_df.head()

Unnamed: 0,allegation_id,crid,officerallegation_id,officer_id,appointed_date,incident_date,category,disciplined,trr_count,total_allegation_count,percent_repeater_status,repeater_status
0,178147,263725,112351,15420,2000-01-24,2000-07-05 19:00:00-05:00,Operation/Personnel Violations,0,0,2,0,0
1,178229,263870,112485,16820,2000-02-28,2000-07-11 19:00:00-05:00,Lockup Procedures,0,0,2,0,0
2,178940,265038,113610,20729,2000-01-24,2000-08-21 19:00:00-05:00,Use Of Force,0,0,3,1,1
3,179061,265234,113797,3006,2000-01-24,2000-08-18 19:00:00-05:00,Operation/Personnel Violations,0,0,3,1,0
4,179247,265568,114097,22295,2000-01-24,2000-09-09 19:00:00-05:00,Operation/Personnel Violations,0,0,6,0,0


In [1202]:
le = LabelEncoder()

encoded_categories = data_df[data_df.columns[6:7]].apply(le.fit_transform)
data_df['category'] = encoded_categories
data_df.head(25)

Unnamed: 0,allegation_id,crid,officerallegation_id,officer_id,appointed_date,incident_date,category,disciplined,trr_count,total_allegation_count,percent_repeater_status,repeater_status
0,178147,263725,112351,15420,2000-01-24,2000-07-05 19:00:00-05:00,8,0,0,2,0,0
1,178229,263870,112485,16820,2000-02-28,2000-07-11 19:00:00-05:00,7,0,0,2,0,0
2,178940,265038,113610,20729,2000-01-24,2000-08-21 19:00:00-05:00,12,0,0,3,1,1
3,179061,265234,113797,3006,2000-01-24,2000-08-18 19:00:00-05:00,8,0,0,3,1,0
4,179247,265568,114097,22295,2000-01-24,2000-09-09 19:00:00-05:00,8,0,0,6,0,0
5,179254,265577,114111,27063,2000-01-24,2000-09-10 19:00:00-05:00,6,0,0,6,1,1
6,179599,266166,114654,28620,2000-01-24,2000-09-30 19:00:00-05:00,8,1,0,6,1,1
7,179624,266209,114690,6821,2000-06-19,2000-09-26 19:00:00-05:00,2,0,0,7,1,1
8,180010,266867,115320,16043,2000-01-24,2000-10-27 19:00:00-05:00,7,0,0,9,1,1
9,180296,267423,115799,12342,2000-06-19,2000-11-23 18:00:00-06:00,12,0,0,4,1,1


In [1203]:
features = data_df[['category', 'trr_count', 'disciplined', 'total_allegation_count']]
targets = data_df[['repeater_status']]

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.15)
y_train=np.ravel(y_train)

In [1204]:
clf = SVC(gamma='auto')
clf.fit(X_train, np.ravel(y_train))

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [1205]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82       203
           1       0.74      0.53      0.62       117

    accuracy                           0.76       320
   macro avg       0.75      0.71      0.72       320
weighted avg       0.76      0.76      0.75       320



In [1206]:
mlp = MLPClassifier()
mlp.fit(X_train, np.ravel(y_train))

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [1207]:
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       203
           1       0.73      0.64      0.68       117

    accuracy                           0.78       320
   macro avg       0.77      0.75      0.76       320
weighted avg       0.78      0.78      0.78       320



In [1217]:
y_pred = mlp.predict_proba(X_test)
y_pred = [1 if x[1]>0.45 else 0 for x in y_pred]
print(classification_report(y_test, y_pred))

[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1]
              precision    recall  f1-s

In [1209]:
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(X_train, y_train)

In [1210]:
y_pred = dtc.predict_proba(X_test)
y_pred = [1 if x[1]>0.1 else 0 for x in y_pred]
print(classification_report(y_test, y_pred))

[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-s

In [1211]:
y_pred = dtc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       203
           1       0.75      0.56      0.64       117

    accuracy                           0.77       320
   macro avg       0.76      0.72      0.73       320
weighted avg       0.77      0.77      0.76       320



In [1199]:
lr = LogisticRegression()
lr = lr.fit(X_train, np.ravel(y_train))

In [1200]:
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89       239
           1       0.74      0.43      0.55        81

    accuracy                           0.82       320
   macro avg       0.79      0.69      0.72       320
weighted avg       0.81      0.82      0.80       320

