In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import collections as co
from operator import itemgetter

In [2]:
df = pd.read_csv("1996_normed.csv")
df = df[df.clerk_school != '9999']
y = df['clerk_school']
X = df.drop(['clerk_school'], axis=1)

In [9]:
X.head(5)

Unnamed: 0,year,judge,a,aboard,about,absent,across,after,against,ahead,...,whose,will,with,within,without,would,yet,you,your,yourself
0,1995,Arnold_Richard_S,0.019964,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,...,0.125,0.038462,0.032,0.033333,0.04,0.007937,0.0,0.0,0.0,0.0
1,1995,Arnold_Richard_S,0.019964,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,...,0.125,0.038462,0.032,0.033333,0.04,0.007937,0.0,0.0,0.0,0.0
2,1995,Arnold_Richard_S,0.019964,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,...,0.125,0.038462,0.032,0.033333,0.04,0.007937,0.0,0.0,0.0,0.0
3,1995,Arnold_Richard_S,0.019964,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,...,0.125,0.038462,0.032,0.033333,0.04,0.007937,0.0,0.0,0.0,0.0
4,1995,Winter_Ralph_K,0.07441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.019231,0.032,0.033333,0.0,0.047619,0.0,0.0,0.0,0.0


In [10]:
X['judge'].describe()

count                10471
unique                 175
top       Ripple_Kenneth_F
freq                   225
Name: judge, dtype: object

In [7]:
X['year'].describe()

count    10471.0
mean      1995.0
std          0.0
min       1995.0
25%       1995.0
50%       1995.0
75%       1995.0
max       1995.0
Name: year, dtype: float64

In [3]:
import re

def labelCleanUp(label):
    label = label.lower()
    label = label.replace('nyu', 'new york')
    label = label.replace('smu', 'southern methodist')
    label = label.replace('case western reserve', 'case western')
    pattern1 = 'law|school|college|uc|university|(university of )|(_u)|( u$)|of'
    pattern2 = '[-_]'
    label = re.sub(pattern1, '', label)
    label = re.sub(pattern2, ' ', label)
    label = label.lstrip().rstrip()
    return label

In [4]:
#split trainng dataset and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [5]:
##get the judge name and year from trainning dataset
judgeYearList_train = pd.concat([X_train['judge'],X_train['year']],axis =1)
judgeYearList_train = judgeYearList_train.reset_index(drop=True)
##get the judge name and year from test dataset
judgeYearList_test = pd.concat([X_test['judge'],X_test['year']],axis =1)
judgeYearList_test = judgeYearList_test.reset_index(drop=True)

In [6]:
#drop judge from X 
X_train = X_train.drop(['judge'], axis=1)
X_test =  X_test.drop(['judge'], axis=1)

In [7]:
#fit model
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e8)
logreg.fit(X_train, y_train)

LogisticRegression(C=100000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
predY_train = logreg.predict_proba(X_train)
predY_test = logreg.predict_proba(X_test)


In [9]:
def predDist(y, model):
    label = model.classes_
    predList = []
    for i in range(len(y)):
        pred_dist = {}
        for j in range(len(label)):
            pred_dist.update({label[j]: y[i][j]})
        ## sort by value 
        from operator import itemgetter
        sort = sorted(pred_dist.items(), key=itemgetter(1), reverse=True)
        predList.append(sort)
    return predList

In [10]:
def findLawSch(judgeName, year, dataset):
    
    sch_list = []
    
    for i in range(len(dataset['Judge Name'])):
            if( dataset['Judge Name'][i] == judgeName and dataset['Year'][i] == year):
                sch_list.append(dataset['Clerk Law School'][i])
                
    cnt = co.Counter(sch_list)
    prob_dict = {}
    for sch,ct in cnt.items():
        prob_dict.update({sch: ct/len(sch_list)})
                
    return sch_list

In [11]:
def score(sch_list, pred_list):
    correct = 0
    unique_sch = list(set(sch_list))
    for sch in unique_sch:
        for i in range(len(sch_list)):
            if sch == pred_list[i][0]:
                correct += 1
    score = correct / len(sch_list)
    return score

In [12]:
def aveScore(pred_Y, judge_year_list, model_name,dataset):
    total_score = 0
    pred_list = predDist(pred_Y, model_name)
    for i in range(len(pred_list)):
        sch_list = findLawSch(judge_year_list['judge'][i], judge_year_list['year'][i], dataset)
        sigle_score = score(sch_list, pred_list[i])
        total_score += sigle_score
    ave_score = total_score / len(pred_list)
    return ave_score
    

In [13]:
clerk_data = pd.read_excel('ClerksMasterList0811.xls')

# Logit 

## score using our metric

In [14]:
train_ave_score = aveScore(predY_train, judgeYearList_train, logreg, clerk_data )
train_ave_score

0.3156479477057603

In [15]:
test_ave_score = aveScore(predY_test, judgeYearList_test, logreg, clerk_data )
test_ave_score

0.29619302266361275

# Logit
## score using built-in metric

In [16]:
logreg.score(X_train, y_train)

0.2242455112695785

In [17]:
logreg.score(X_test, y_test)

0.15508021390374332

# svm

In [18]:
from sklearn.svm import SVC
clf = SVC(probability= True)
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# SVM
## score using built-in metric

In [19]:
clf.score(X_train, y_train)

0.12670317076276583

In [20]:
clf.score(X_test, y_test)

0.1371275783040489

In [21]:
svm_predY_train = clf.predict_proba(X_train)
svm_predY_test = clf.predict_proba(X_test)

## score using our metric

In [22]:
svm_train_ave_score = aveScore(svm_predY_train, judgeYearList_train, clf, clerk_data )
svm_train_ave_score

0.29283925463729144

In [23]:
svm_test_ave_score = aveScore(svm_predY_test, judgeYearList_test, clf, clerk_data )
svm_test_ave_score 

0.2846638655462205

# NNet

In [24]:
from sklearn.neural_network import MLPClassifier
nnet = MLPClassifier(activation='logistic', solver='sgd', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1, max_iter=500)
nnet.fit(X_train, y_train)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='sgd', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [25]:
nnet_predY_train = nnet.predict_proba(X_train)
nnet_predY_test = nnet.predict_proba(X_test)

In [26]:
nnet.score(X_train, y_train)

0.12670317076276583

In [27]:
nnet.score(X_test, y_test)

0.1371275783040489

In [28]:
nnet_train_ave_score = aveScore(nnet_predY_train, judgeYearList_train, nnet, clerk_data )
nnet_train_ave_score

0.2204889851012291

In [29]:
nnet_test_ave_score = aveScore(nnet_predY_test, judgeYearList_test, nnet, clerk_data )
nnet_test_ave_score

0.22469442322383476

# Random Forest


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [31]:
rf = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)

In [32]:
rf.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [33]:
scores = cross_val_score(rf, X_train, y_train)
scores.mean()



0.13280779050535504

In [34]:
rf_predY_train = rf.predict_proba(X_train)
rf_predY_test = rf.predict_proba(X_test)

## score using our metric

In [35]:
rf_train_ave_score = aveScore(rf_predY_train, judgeYearList_train, rf, clerk_data )
rf_train_ave_score

0.6911562460206353

In [36]:
rf_test_ave_score = aveScore(rf_predY_test, judgeYearList_test, rf, clerk_data )
rf_test_ave_score

0.4701935319582382

# RF
# score using built-in metric

In [37]:
scores = cross_val_score(rf, X_train, y_train)
scores.mean()



0.13280779050535504

In [38]:
scores = cross_val_score(rf, X_test, y_test)
scores.mean()



0.1254515579534374