# LIONS Analysis

Our project is to analyze federal cases.

In [3]:
from lions.parse import *
import pandas as pd
import numpy as np

In [4]:
_, cases = gs_case(columns=['ID', 'DISTRICT', 'CLASS', 'LEAD_CHARGE'])
_, hist = gs_court_hist(columns=['ID','CASEID', 'DISTRICT', 'COURT', 'APPEAL_TYPE', 'SENT_APPEAL', \
                                 'DISPOSITION', 'DISP_REASON1', 'DISP_REASON2', 'DISP_REASON3'])
_, court_judge = gs_court_judge(columns=['ID','CASEID', 'DISTRICT', 'CRTHISID', 'JUDGEID', 'DECISION'])
_, judge = gs_judge(columns=['DISTRICT', 'ID', 'LAST_NAME', 'FIRST_NAME'])

In [5]:
cases_df = pd.DataFrame(cases, columns = cases[0].keys())
cases_df_cols = list(cases_df.columns)
cases_df_cols[0] = 'CASEID'
cases_df.columns = cases_df_cols
#del cases

hist_df = pd.DataFrame(hist, columns = hist[0].keys())
hist_df_cols = list(hist_df.columns)
hist_df_cols[0] = 'CRTHISID'
hist_df.columns = hist_df_cols
#del hist

court_judge_df = pd.DataFrame(court_judge, columns = court_judge[0].keys())
#del court_judge

judge_df = pd.DataFrame(judge, columns = judge[0].keys())
judge_df_cols = list(judge_df.columns)
judge_df_cols[1] = 'JUDGEID'
judge_df.columns = judge_df_cols
#del judge

print(len(cases_df))
print(len(hist_df))

6041748
8262351


In [6]:
cases_df = cases_df.sample(100000)

In [8]:
cases_hist_df = pd.merge(cases_df, 
                         hist_df, 
                         left_on = ['CASEID','DISTRICT'], 
                         right_on = ['CASEID','DISTRICT'], 
                         how = 'inner')
#del cases_df
#del hist_df
print("Done merging Cases DF and Hist DF")
print(len(cases_hist_df))

cases_hist_court_df = pd.merge(cases_hist_df, 
                               court_judge_df, 
                               left_on = ['CASEID','DISTRICT', 'CRTHISID'], 
                               right_on = ['CASEID','DISTRICT', 'CRTHISID'], 
                               how = 'inner')
#del cases_hist_df
#del court_judge_df
print("Done merging Cases Hist DF and Court Judge DF")
print(len(cases_hist_court_df))

cases_hist_court_judge_df = pd.merge(cases_hist_court_df, 
                                     judge_df, 
                                     left_on = ['JUDGEID','DISTRICT'], 
                                     right_on = ['JUDGEID','DISTRICT'], 
                                     how = 'inner')
#del cases_hist_court_df
#del judge_df
print("Done merging Cases Hist Court DF and Judge DF")
print(len(cases_hist_court_judge_df))


withdisp_df = cases_hist_court_judge_df[cases_hist_court_judge_df.DISPOSITION != '']
#del cases_hist_court_judge_df

withdisp_reason_df = withdisp_df[withdisp_df.DISP_REASON1 != '']
withoutdisp_reason_df = withdisp_df[withdisp_df.DISP_REASON1 == '']

Done merging Cases DF and Hist DF
136700
Done merging Cases Hist DF and Court Judge DF
83037
Done merging Cases Hist Court DF and Judge DF
83002


In [9]:
withdisp_reason_df.columns

Index(['CASEID', 'DISTRICT', 'CLASS', 'LEAD_CHARGE', 'CRTHISID', 'COURT',
       'APPEAL_TYPE', 'SENT_APPEAL', 'DISPOSITION', 'DISP_REASON1',
       'DISP_REASON2', 'DISP_REASON3', 'ID', 'JUDGEID', 'DECISION',
       'LAST_NAME', 'FIRST_NAME'],
      dtype='object')

In [11]:
columns_to_drop = ['ID','CASEID', 'CRTHISID','DISP_REASON1', 'DISP_REASON2',
                   'DISP_REASON3', 'JUDGEID', 'DECISION', 'LAST_NAME', 'FIRST_NAME']

disp_reasons = ['VACA','VARM','RDAP','RDRR','REVA','REVR','OFPO','GWDA','DEPO','LECI','WKEV','WTPR']


misconduct = []
disp1 = withdisp_reason_df['DISP_REASON1'].tolist()

for i in range(len(disp1)):
    if disp1[i] in disp_reasons: 
        misconduct.append(1)
    else:
        misconduct.append(0)


df_classify = withdisp_reason_df.drop(columns_to_drop, axis = 1)
df_to_classify = withoutdisp_reason_df.drop(columns_to_drop, axis=1)

df_classify.columns


Index(['DISTRICT', 'CLASS', 'LEAD_CHARGE', 'COURT', 'APPEAL_TYPE',
       'SENT_APPEAL', 'DISPOSITION'],
      dtype='object')

In [12]:
df_district = pd.get_dummies(df_classify['DISTRICT'], prefix = 'District')
df_charge = pd.get_dummies(df_classify['LEAD_CHARGE'], prefix = 'Charge')
df_class = pd.get_dummies(df_classify['CLASS'], prefix = 'Class')
df_court = pd.get_dummies(df_classify['COURT'], prefix = 'Court')
df_appeal = pd.get_dummies(df_classify['APPEAL_TYPE'], prefix = 'Appeal')
df_sent = pd.get_dummies(df_classify['SENT_APPEAL'], prefix = 'Sent')
df_disposition = pd.get_dummies(df_classify['DISPOSITION'], prefix = 'Disposition')

df_new = pd.concat([df_classify, df_district, df_charge, df_class, df_court, df_appeal, df_sent, df_disposition], axis=1)
del df_classify
del df_district
del df_charge
del df_class 
del df_court
del df_appeal
del df_sent
del df_disposition

df_new = df_new.drop(['DISTRICT','CLASS','LEAD_CHARGE', 'COURT', 'APPEAL_TYPE','SENT_APPEAL','DISPOSITION'], axis=1)
print("First step done")


df_district = pd.get_dummies(df_to_classify['DISTRICT'], prefix = 'District')
df_charge = pd.get_dummies(df_to_classify['LEAD_CHARGE'], prefix = 'Charge')
df_class = pd.get_dummies(df_to_classify['CLASS'], prefix = 'Class')
df_court = pd.get_dummies(df_to_classify['COURT'], prefix = 'Court')
df_appeal = pd.get_dummies(df_to_classify['APPEAL_TYPE'], prefix = 'Appeal')
df_sent = pd.get_dummies(df_to_classify['SENT_APPEAL'], prefix = 'Sent')
df_disposition = pd.get_dummies(df_to_classify['DISPOSITION'], prefix = 'Disposition')

df_new_to_classify = pd.concat([df_to_classify, df_district, df_charge, df_class, df_court, df_appeal, df_sent, df_disposition], axis=1)
df_new_to_classify = df_new_to_classify.drop(['DISTRICT','CLASS','LEAD_CHARGE', 'COURT', 'APPEAL_TYPE','SENT_APPEAL','DISPOSITION'], axis=1)
print("Second step done")

del df_to_classify
del df_district
del df_charge
del df_class 
del df_court
del df_appeal
del df_sent
del df_disposition

common_cols = list(set(df_new).intersection(df_new_to_classify))

df_new = df_new[common_cols]
df_new_to_classify = df_new_to_classify[common_cols]
df_new

First step done
Second step done


Unnamed: 0,Charge_21 :00846,Court_SC,Charge_18 :00922O,Charge_18 :01073,Charge_18 :01963,District_NJ,District_WVS,Charge_18 :00013,Charge_18 :01029,Charge_18 :01111,...,District_NCW,Charge_18 :01951,District_MIW,Court_MM,District_TNE,Charge_21 :00841a,Charge_50R:27.84,District_PAM,District_LAM,District_DC
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [16]:
#generate our train and test dataset
m = int(df_new.shape[0]*0.1)

df_to_be_classified = df_new_to_classify

df_train = df_new.sample(m)
df_test = df_new.drop(df_train.index)
list_train = misconduct[:m]
list_test = misconduct[m:]

df_train

Unnamed: 0,Charge_21 :00846,Court_SC,Charge_18 :00922O,Charge_18 :01073,Charge_18 :01963,District_NJ,District_WVS,Charge_18 :00013,Charge_18 :01029,Charge_18 :01111,...,District_NCW,Charge_18 :01951,District_MIW,Court_MM,District_TNE,Charge_21 :00841a,Charge_50R:27.84,District_PAM,District_LAM,District_DC
49534,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82603,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
47625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33977,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27664,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# from sklearn.neighbors import KNeighborsClassifier
# import sklearn.model_selection as cross_validation

# #build our knn model and print the accuracy score
# knn = KNeighborsClassifier(n_neighbors=10)
# knn.fit(df_train, list_train)

# #print the accuracy score
# print('KNeighbor accuracy on test data: ', knn.score(df_test, list_test))
# print('KNeighbor accuracy on training data: ', knn.score(df_train, list_train))

# #accuracy score from crossvalidation
# scores = cross_validation.cross_val_score(knn, df_new, misconduct, cv=5)
# print("KNN Accuracy from crossvalidation:: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

KNeighbor accuracy on test data:  0.958860892256316
KNeighbor accuracy on training data:  0.968
KNN Accuracy from crossvalidation:: 0.96 (+/- 0.01)


In [17]:
from sklearn import tree
import sklearn.model_selection as cross_validation

#build our decision trees model
dtc = tree.DecisionTreeClassifier()
dtc.fit(df_train, list_train)

#print the accuracy score
print('DT accuracy on test data: ', dtc.score(df_test, list_test))
print('DT accuracy on training data: ', dtc.score(df_train, list_train))

#accuracy score from crossvalidation
scores = cross_validation.cross_val_score(dtc, df_new, misconduct, cv=5)
print("DT Accuracy from crossvalidation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

DT accuracy on test data:  0.9963213597198726
DT accuracy on training data:  0.9996458923512748
DT Accuracy from crossvalidation: 0.99 (+/- 0.01)


In [18]:
from sklearn.linear_model import LogisticRegression

#build our logistic regression model
lgr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
lgr.fit(df_train, list_train)

#print the accuracy score
print("Accuracy of logistic regression test set:", lgr.score(df_test, list_test))
print("Accuracy of logistic regression train set:", lgr.score(df_train, list_train))

#accuracy score from crossvalidation
scores = cross_validation.cross_val_score(lgr, df_new, misconduct, cv=5)
print("Logistic Regression Accuracy from crossvalidation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy of logistic regression test set: 0.9964590628319628
Accuracy of logistic regression train set: 0.9994688385269122
Logistic Regression Accuracy from crossvalidation: 1.00 (+/- 0.00)


In [19]:
#Apply the classifier to the dataset withoutdisp_reason_df

df_to_be_classified

Unnamed: 0,Charge_21 :00846,Court_SC,Charge_18 :00922O,Charge_18 :01073,Charge_18 :01963,District_NJ,District_WVS,Charge_18 :00013,Charge_18 :01029,Charge_18 :01111,...,District_NCW,Charge_18 :01951,District_MIW,Court_MM,District_TNE,Charge_21 :00841a,Charge_50R:27.84,District_PAM,District_LAM,District_DC
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [20]:
df_prediction_knn = lgr.predict(df_to_be_classified)

df_results = withoutdisp_reason_df.copy()
df_results['misconduct'] = df_prediction_knn
df_misconduct1 = df_results[df_results['misconduct'] == 1]


df_result = withdisp_reason_df.copy()
df_result['misconduct'] = misconduct
df_misconduct2 = df_result[df_result['misconduct'] == 1]


df_misconduct = pd.concat([df_misconduct1,df_misconduct2],axis=0)
df_misconduct['DISTRICT'].value_counts()

CAS    49
WVS    18
MOE    17
ILN    11
MIE    11
NYE     7
WAW     6
OHN     5
NCE     5
NV      4
DC      4
GAM     4
TNW     3
INS     3
NYS     2
MA      2
FLM     2
VAE     2
ALN     2
TXN     2
ID      2
CAC     2
AZ      1
TXW     1
CAN     1
NJ      1
TXS     1
OR      1
PR      1
NYW     1
AK      1
CAE     1
KS      1
CT      1
NE      1
WY      1
HI      1
WAE     1
PAW     1
GAN     1
WIW     1
MSS     1
Name: DISTRICT, dtype: int64