# BinaryClassifier to classify case into wether they are more likely to get acquitted or convicted

### Note: the trained model is saved in a pickle file at the end of this notebook

---
### *Preprocessing:*

In [183]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve  
import matplotlib.pyplot as plt
import pickle 

In [157]:
dataset = pd.read_csv("./precog_data/dataset.csv")
dataset = dataset.drop(["Unnamed: 0"], axis = 1)

In [158]:
df = pd.read_csv("./precog_data/csv/keys/disp_name_key.csv")

In [159]:
dataset_1 = pd.merge(
    left = dataset,
    right = df,
    on = "disp_name",
    how="left"
)

In [160]:
dataset_1 = dataset_1[(dataset_1["disp_name_s"] == "acquitted") | (dataset_1["disp_name_s"] == "convicted")]

In [161]:
dataset = dataset_1.drop(["count","year_x"], axis = 1)

---
###** Assigning boolean values to the disposition values of interest: ** 

In [162]:
def boolean_val(s):
    if s == "acquitted":
         return 1
    else:
        return 0

In [163]:
dataset["bool_disp"] = dataset.apply(lambda x : boolean_val(x["disp_name_s"]), axis = 1)

In [164]:
dataset = dataset.drop(["disp_name_s", "disp_name"], axis = 1)

---
### *Here we assign columns for numerical data and categorial data for better performance of a prediction model*

In [165]:
numerical_cols = ["filing_after_2010", "first_list_after_2010", "judge_exp_first"]
categorical_cols = ["female_judge","state_code","female_defendant","female_petitioner","criminal", "year_y"]
label_cols = ["bool_disp"]

### *Since we have so much categorial data, we use OneHotEncoder to encode all the categorial data, anwe use StandardScaling to standardize numerical data*

In [166]:
categorial_preprocessor = OneHotEncoder(handle_unknown='ignore')
numerical_preprocessor  = StandardScaler()

In [167]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_cols),
    ('standard_scaler', numerical_preprocessor, numerical_cols),
])

### *RandomForestClassifier was chosen because DecisionTrees in general are the good at dealing with large amounts of categorial data Also, this classifier scales well with large data. Since we use 100 base node trees, our model is less likely to suffer from overfitting*

In [168]:
model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators = 100))

In [169]:
x = dataset[['female_judge', 'state_code', 'female_defendant', 'female_petitioner',
       'criminal', 'year_y' , 'filing_after_2010',
       'first_list_after_2010', 'judge_exp_first']]
y = dataset['bool_disp']

In [170]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [171]:
%%time
model.fit(x_train, y_train)

CPU times: user 1h 13min 9s, sys: 9.75 s, total: 1h 13min 18s
Wall time: 1h 13min 22s


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['female_judge', 'state_code',
                                                   'female_defendant',
                                                   'female_petitioner',
                                                   'criminal', 'year_y']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['filing_after_2010',
                                                   'first_list_after_2010',
                                                   'judge_exp_first'])])),
                ('randomforestclassifier', RandomForestClassifier())])

---
### Test Score:

In [172]:
model.score(x_test, y_test) * 100

99.51983159626656

### *As seen in the scoring metric on randomized test data, this model acheives a 99.5% accuracy, which means our model didnt overfit. Also, a 99.5% accuracy while only using baseline data (data that is available at the start of a case, and is very easy to fetch) means that this model very accurately predict wether a defendant is going to be convicted to acquited,  usually far before the judge makes a decision.*
---

### Confusion Matrix : 

In [177]:
y_pred = model.predict(x_test)

In [176]:
confusion_matrix(y_test, y_pred)

array([[ 291765,    4419],
       [   3519, 1353467]])

### *The confusion matrix has a much higher density on the bottom right diagonal which shows that the model is accurate*
---

---
### Log_Loss:

In [184]:
logloss = log_loss(y_test, model.predict_proba(x_test))
logloss


0.0072268075459298265

### *The cross entropy loss is very close to 0, indicative of a high performance model*
---

In [180]:
# Run this cell to save classifier
f = open("./precog_data/my_pickle", "wb")
pickle.dump(model, f)
f.close()

In [None]:
# Run this cell to get classifier from pickle
f = open("./precog_data/my_pickle", "rb")
model = pickle.load(f)
f.close

---