<a href="https://colab.research.google.com/github/ahan98/tda/blob/main/PredictingJudges.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
import pandas as pd
import numpy as np

In [39]:
%%time
csv = pd.read_csv("/content/drive/MyDrive/FinalDataset.csv", low_memory=False)
csv.shape
%store csv

CPU times: user 1min 4s, sys: 11.7 s, total: 1min 16s
Wall time: 1min 24s


In [61]:
# subset relevant variables
nominal = ["AMTFINEC", "SENTTOT", "ALTMO", "AGE", "ADJOFLHI"]
categorical = ["MONCIRC", "NEWEDUC", "NEWRACE", "MONSEX", "judge_clean"]
df = csv[nominal + categorical]
df.dropna(inplace=True)  # drop rows with missing data
df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


(548653, 10)

In [62]:
# encode each judge name with their alphabetical order
judge_names = sorted(set(df["judge_clean"]))
idxs = range(len(judge_names))
judge_dict = dict(zip(judge_names, idxs))
df = df.replace({"judge_clean": judge_dict})

In [63]:
df.head()

Unnamed: 0,AMTFINEC,SENTTOT,ALTMO,AGE,ADJOFLHI,MONCIRC,NEWEDUC,NEWRACE,MONSEX,judge_clean
0,0.0,24.0,0,26.0,24.0,0,1.0,3.0,0.0,31
1,0.0,8.0,0,29.0,12.0,0,1.0,3.0,0.0,31
2,0.0,3.0,0,36.0,8.0,0,1.0,3.0,0.0,31
3,0.0,6.0,0,61.0,22.0,0,5.0,3.0,0.0,31
4,0.0,18.0,0,43.0,14.0,0,5.0,2.0,0.0,31


In [64]:
vars_to_encode = ["NEWEDUC", "NEWRACE"]
df[vars_to_encode] = df[vars_to_encode].apply(pd.to_numeric, downcast="integer")
df.head()

Unnamed: 0,AMTFINEC,SENTTOT,ALTMO,AGE,ADJOFLHI,MONCIRC,NEWEDUC,NEWRACE,MONSEX,judge_clean
0,0.0,24.0,0,26.0,24.0,0,1,3,0.0,31
1,0.0,8.0,0,29.0,12.0,0,1,3,0.0,31
2,0.0,3.0,0,36.0,8.0,0,1,3,0.0,31
3,0.0,6.0,0,61.0,22.0,0,5,3,0.0,31
4,0.0,18.0,0,43.0,14.0,0,5,2,0.0,31


In [65]:
%store df

Stored 'df' (DataFrame)


# Predicting Judges

## Random Forest

Scikit-learn is currently unable to evaluate random forests via categorical variables (even though random forests are able to handle both categorical and nominal data, theoretically). Therefore, we must one-hot encode each of the categorical variables. (Note that "MONSEX" is already a binary indicator variable, so we do not need to manually one-hot encode it.)

In [69]:
# one-hot encode each categorical variable
encoded = df.iloc[:, :6]
for var in vars_to_encode:
    prefix = var[3:]  # e.g., prefix of "NEWEDUC" is "EDUC"
    temp = pd.get_dummies(df[var], prefix=prefix)
    encoded = encoded.join(temp)
encoded = encoded.join(df["judge_clean"])
encoded

Unnamed: 0,AMTFINEC,SENTTOT,ALTMO,AGE,ADJOFLHI,MONCIRC,EDUC_1,EDUC_3,EDUC_5,EDUC_6,RACE_1,RACE_2,RACE_3,RACE_6,judge_clean
0,0.0,24.0,0,26.0,24.0,0,1,0,0,0,0,0,1,0,31
1,0.0,8.0,0,29.0,12.0,0,1,0,0,0,0,0,1,0,31
2,0.0,3.0,0,36.0,8.0,0,1,0,0,0,0,0,1,0,31
3,0.0,6.0,0,61.0,22.0,0,0,0,1,0,0,0,1,0,31
4,0.0,18.0,0,43.0,14.0,0,0,0,1,0,0,1,0,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595845,0.0,60.0,0,35.0,30.0,11,0,1,0,0,1,0,0,0,1114
595846,0.0,80.0,0,41.0,36.0,11,0,1,0,0,0,1,0,0,1114
595847,1000.0,0.0,0,24.0,5.0,11,0,1,0,0,1,0,0,0,1114
595848,0.0,0.0,0,32.0,8.0,11,0,0,0,1,1,0,0,0,1114


In [70]:
%store encoded
#%store -r df

Stored 'encoded' (DataFrame)


Next, since "MONCIRC" denotes the district of the presiding judge, if we know the district for a given case, then we can eliminate all judges outside that district. Therefore, we train *one classifier per district* to improve performance and reduce memory requirements.

In [72]:
# partition the cases based on district of presiding judge
data_by_district = []
for district in range(12):
    temp = encoded[encoded["MONCIRC"] == district]
    temp = temp.drop(["MONCIRC"], axis=1)
    data_by_district.append(temp)

data_by_district[11]

Unnamed: 0,AMTFINEC,SENTTOT,ALTMO,AGE,ADJOFLHI,EDUC_1,EDUC_3,EDUC_5,EDUC_6,RACE_1,RACE_2,RACE_3,RACE_6,judge_clean
520711,0.0,21.0,0,55.0,20.0,0,1,0,0,1,0,0,0,258
520712,0.0,8.0,0,48.0,12.0,0,0,1,0,0,0,1,0,258
520715,0.0,6.0,0,36.0,10.0,0,0,1,0,1,0,0,0,258
520716,0.0,60.0,0,44.0,26.0,1,0,0,0,0,1,0,0,258
520747,0.0,48.0,0,29.0,26.0,1,0,0,0,1,0,0,0,663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595845,0.0,60.0,0,35.0,30.0,0,1,0,0,1,0,0,0,1114
595846,0.0,80.0,0,41.0,36.0,0,1,0,0,0,1,0,0,1114
595847,1000.0,0.0,0,24.0,5.0,0,1,0,0,1,0,0,0,1114
595848,0.0,0.0,0,32.0,8.0,0,0,0,1,1,0,0,0,1114


In [97]:
# Source: https://machinelearningmastery.com/random-forest-ensemble-in-python/

# evaluate random forest algorithm for classification
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# define dataset
df0 = data_by_district[0]
X, y = df0.iloc[:, :-1], df0.iloc[:, -1]
# define the model
model = RandomForestClassifier()
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))



Accuracy: 0.091 (0.015)
