In [32]:
import string
import re
import numpy as np
import pandas as pd
import operator
import os
import sys
import io
import collections
import matplotlib
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree 
from sklearn import preprocessing
from sklearn import metrics
from sklearn import svm
from sklearn.cross_validation import train_test_split, cross_val_score

from sklearn.grid_search import GridSearchCV
from sklearn import svm

Variables  
* term: This variable identifies the term in which the Court handed down its decision.  
* naturalCourt: A natural court is a period during which no personnel change occurs. Scholars have subdivided them into "strong" and "weak" natural courts, but no convention exists as to the dates on which they begin and end. Options include 1) date of confirmation, 2) date of seating, 3) cases decided after seating, and 4) cases argued and decided after seating.  
* petitioner: Petitioner" refers to the party who petitioned the Supreme Court to review the case. This party is variously known as the petitioner or the appellant.
* respondent: Respondent" refers to the party being sued or tried and is also known as the appellee.
* caseOrigin: The focus of this variable is the court in which the case originated,
* caseSource: This variable identifies the court whose decision the Supreme Court reviewed. 
* lcDisposition: This variable specifies the treatment the court whose decision the Supreme Court reviewed accorded the decision of the court it reviewed; e.g., whether the court below the Supreme Court---typically a federal court of appeals or a state supreme court---affirmed, reversed, remanded, etc. the decision of the court it reviewed---typically a trial court. 
* issueArea: This variable simply separates the issues identified in the preceding variable (issue) into the following larger categories: criminal procedure, civil rights, First Amendment, due process, privacy , attorneys' or governmental officials' fees or compensation, unions, economic activity, judicial power, federalism, interstate relation, federal taxation, miscellaneous, and private law. 

In [2]:
def encode_variable(df, target_column):
    """Encode variable into a number.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

#from graphviz documentation
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [35]:
#create the target, and train
#read  the csv and make a big df
bigdf=pd.read_csv("SCDB_2015_01_justiceCentered_Citation.csv")

#create an empty small df
smalldf = pd.DataFrame()

#select the variables to run the classifier on 
#casedisposition is our target
train_vars = ['term', 'naturalCourt', 'petitioner',
                'respondent', 'caseOrigin', 'caseSource', 'lcDisposition', 'issueArea', 'partyWinning', 'majority']

# ["term", "naturalCourt ", "petitioner", "respondent", "caseOrigin", "caseSource", "lcDisposition", "issueArea"]
#add train columns to smalldf
smalldf = bigdf[train_vars]

#drop row if any values are NAN - maxmimum  22% of original data 
smalldf=smalldf.dropna(axis=0,how='any')

print smalldf.shape

# smalldf, _ = encode_variable(smalldf,'chief')
# smalldf, _ = encode_variable(smalldf, 'dateDecision')

# smalldf.majority refers to whether justice voted with the majority (1 for dissent, 2 for majority)
# smalldf.partyWinning indicates winning party (0 for responding party, 1 for petitioning party, 2 for unclear)
# We use the above 2 features to infer which party the individual justice voted for
# NOTE: majority has around 4000 NaNs that we should filter out?

results = []

for idx, x in smalldf.iterrows(): 
    if x.partyWinning == 2:
        results.append(2)
        
    if x.partyWinning == 1 and x.majority == 2: 
        results.append(1)
        
    if x.partyWinning == 0 and x.majority == 2: 
        results.append(0)
        
    if x.partyWinning == 1 and x.majority == 1: 
        results.append(0)
        
    if x.partyWinning == 0 and x.majority == 1:
        results.append(1)

smalldf['justiceVote'] = results

smalldf['is_train'] = np.random.uniform(0, 1, len(smalldf)) <= .75

train, test = smalldf[smalldf['is_train']==True], smalldf[smalldf['is_train']==False]

features = list(smalldf.columns[:8])

(62929, 10)


Decision Tree

In [29]:
dt = DecisionTreeClassifier(random_state=99)
dt.fit(train[features],train["justiceVote"])
visualize_tree(dt, features)
dt.score(test[features],test['justiceVote'])

(62929, 10)


0.74780058651026393

In [33]:
Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gs=GridSearchCV(dt, param_grid={'C':Cs}, cv=5)
gs.fit(train[features], train['justiceVote'])
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_

ValueError: Invalid parameter C for estimator DecisionTreeClassifier

RANDOM FOREST

In [17]:
clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)

clf_result = clf.fit(train[features], train['justiceVote'])

clf_result.score(test[features], test['justiceVote'])

0.76283432627709469

In [34]:
Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gs=GridSearchCV(clf, param_grid={'C':Cs}, cv=5)
gs.fit(train[features], train['justiceVote'])
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_

ValueError: Invalid parameter C for estimator RandomForestClassifier

SVM

In [23]:
%%time

svc = svm.LinearSVC(loss="hinge")
svc_classifier = svc.fit(train[features], train['justiceVote'])
print svc_classifier.score(test[features], test['justiceVote'])


0.612435036126
CPU times: user 7.25 s, sys: 55.1 ms, total: 7.3 s
Wall time: 7.37 s


In [25]:

Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gs=GridSearchCV(svc, param_grid={'C':Cs}, cv=5)
gs.fit(train[features], train['justiceVote'])
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_

BEST {'C': 0.001} 0.5686411741 [mean: 0.56864, std: 0.09150, params: {'C': 0.001}, mean: 0.52289, std: 0.11207, params: {'C': 0.01}, mean: 0.56864, std: 0.09150, params: {'C': 0.1}, mean: 0.43136, std: 0.09150, params: {'C': 1.0}, mean: 0.52289, std: 0.11207, params: {'C': 10.0}, mean: 0.52287, std: 0.11208, params: {'C': 100.0}]


In [26]:
best = gs.best_estimator_
best.fit(train[features], train['justiceVote'])
best.score(test[features], test['justiceVote'])

0.61243503612625172