In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

In [4]:
# Load the case-level training set. Every column is read as a string
# (dtype='object') and the first CSV column becomes the row index.
df_train = pd.read_csv(
    'case_level_train_set.csv',
    index_col=0,
    dtype='object',
    encoding='ISO 8859-1',
)

In [5]:
# Show the column labels of the freshly loaded training frame.
df_train.columns

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes'],
      dtype='object')

In [6]:
# Summarise the lower-court-disagreement flag (count / unique / top / freq).
df_train['lcDisagreement'].describe()

count     22918
unique        2
top           0
freq      20721
Name: lcDisagreement, dtype: object

In [7]:
# Columns excluded from the feature set; every column not listed here is kept.
# The same list is reused later to clean the test set.
dropcols = [
    'docketId', 'caseIssuesId', 'voteId', 'dateDecision', 'decisionType',
    'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term', 'chief', 'docket',
    'caseName', 'dateArgument', 'dateRearg', 'petitionerState',
    'respondentState', 'adminAction', 'adminActionState', 'caseOriginState',
    'threeJudgeFdc', 'caseSourceState', 'declarationUncon', 'caseDisposition',
    'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
    'voteUnclear', 'issue', 'lcDisposition', 'decisionDirectionDissent',
    'authorityDecision1', 'authorityDecision2', 'lawType', 'lawSupp',
    'lawMinor', 'majOpinWriter', 'majOpinAssigner', 'splitVote', 'majVotes',
    'minVotes',
]

# Drop everything in a single call instead of one in-place drop per column:
# the frame is never left half-processed, and errors='ignore' lets the cell be
# re-run after the columns are already gone without raising KeyError.
# NOTE(review): errors='ignore' also hides a misspelled name in dropcols;
# check the column listing in the next cell after editing the list.
df_train = df_train.drop(dropcols, axis=1, errors='ignore')




In [8]:
# Check which columns remain and how many non-null values each one has.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22936 entries, 0 to 19860
Data columns (total 12 columns):
caseId                    22936 non-null object
naturalCourt              22936 non-null object
petitioner                22934 non-null object
respondent                22932 non-null object
jurisdiction              22935 non-null object
caseOrigin                22181 non-null object
caseSource                22494 non-null object
lcDisagreement            22918 non-null object
certReason                22872 non-null object
lcDispositionDirection    21989 non-null object
issueArea                 22840 non-null object
decisionDirection         22859 non-null object
dtypes: object(12)
memory usage: 2.3+ MB


In [9]:
# Discard every row that has a missing value in any remaining column
# (the target column is still part of the frame at this point).
df_train = df_train.dropna(axis=0, how='any')

In [10]:
# Confirm the columns that survived the drop / dropna steps.
df_train.columns

Index(['caseId', 'naturalCourt', 'petitioner', 'respondent', 'jurisdiction',
       'caseOrigin', 'caseSource', 'lcDisagreement', 'certReason',
       'lcDispositionDirection', 'issueArea', 'decisionDirection'],
      dtype='object')

In [11]:
# Name of the label column; all remaining columns are treated as features.
target = 'decisionDirection'
y_train = df_train[target]

# Take an explicit copy of the feature columns. Selecting columns without
# .copy() yields a frame pandas regards as a slice of df_train, so the
# column assignments made on X_train later raise SettingWithCopyWarning.
X_train = df_train.loc[:, df_train.columns != target].copy()

In [12]:
# Preview the first rows of the feature frame.
X_train.head()

Unnamed: 0,caseId,naturalCourt,petitioner,respondent,jurisdiction,caseOrigin,caseSource,lcDisagreement,certReason,lcDispositionDirection,issueArea
0,1946-001,1301,198,172,6,51,29,0,11,1,8
1,1946-002,1301,100,27,1,123,30,0,4,1,1
2,1946-003,1301,209,27,2,107,107,0,1,2,8
3,1946-004,1301,27,170,1,3,3,0,10,2,2
5,1946-006,1301,198,4,2,302,300,1,1,2,8


In [13]:
# Per-column cardinality and most frequent value of the (string-typed) features.
X_train.describe()

Unnamed: 0,caseId,naturalCourt,petitioner,respondent,jurisdiction,caseOrigin,caseSource,lcDisagreement,certReason,lcDispositionDirection,issueArea
count,21575,21575,21575,21575,21575,21575,21575,21575,21575,21575,21575
unique,21575,105,263,253,12,203,186,2,13,3,14
top,1883-106,706,195,27,1,302,300,0,1,1,8
freq,1,1250,1915,2310,7875,4444,4021,19676,13729,9947,6471


In [14]:
# Code list: one column per party category, each holding the numeric party
# codes that belong to that category.
parties = pd.read_csv("codelist.csv", index_col=None)

# Category columns paired with the label they map to, in priority order:
# when a code is listed under several columns, the earliest column wins.
_PARTY_CATEGORY_LABELS = [
    ('State', '1'),
    ('Federal', '2'),
    ('Criminal', '3'),
    ('Liberal', '4'),
    ('RTY', '5'),
    ('Finance', '6'),
    ('Business', '7'),
    ('Political', '8'),
    ('famend', '9'),
]


def _build_party_lookup(codes):
    """Flatten the per-category code columns into a ``{code: label}`` dict.

    Columns are visited in priority order and ``setdefault`` keeps the first
    label seen for a code, so the result matches a first-match if/elif chain
    over the same columns. Missing cells are skipped before the int
    conversion.
    # NOTE(review): assumes any padding in codelist.csv is NaN - confirm.
    """
    lookup = {}
    for column, label in _PARTY_CATEGORY_LABELS:
        for code in codes[column].dropna():
            lookup.setdefault(int(code), label)
    return lookup


# Built once when the cell runs, so each categorisation is a single dict
# lookup instead of up to nine linear scans over numpy arrays.
_PARTY_LOOKUP = _build_party_lookup(parties)


def party_categorizer(val):
    """Return the party-category label for a numeric party code.

    Parameters
    ----------
    val : int
        Party code (callers convert the raw string with ``int`` first).

    Returns
    -------
    str
        '1'..'9' for a code listed in the corresponding category column of
        ``parties``, or '10' when the code appears in none of them.
    """
    return _PARTY_LOOKUP.get(val, '10')
 

def issue_consolidate(num):
    """Collapse selected issue-area codes into the catch-all label '13'.

    Parameters
    ----------
    num : str, int, float or None
        Raw issue-area code. Missing values may arrive as the literal string
        "None", as ``None`` or as NaN.

    Returns
    -------
    str
        '13' for missing values and for codes 0, 3, 5, 6, 7, 11 and 14;
        otherwise the code as a canonical integer string, so that 8, ' 8'
        and 8.0 all yield '8' and end up in the same category.

    Raises
    ------
    ValueError
        If ``num`` is a non-missing value that cannot be parsed as an integer.
    """
    # Missing value spelled as text or as a real None.
    if num is None or num == "None":
        return '13'
    # NaN is the only float that is not equal to itself.
    if isinstance(num, float) and num != num:
        return '13'

    code = int(num)
    if code in (0, 3, 5, 6, 7, 11, 14):
        return '13'
    return str(code)

In [15]:
# Recode the raw string codes into the coarser categories used by the model.
# assign() returns a new frame rather than writing into X_train one column at
# a time, so no SettingWithCopyWarning is raised and column order is kept.
X_train = X_train.assign(
    # Merge the selected issue areas into the catch-all bucket.
    issueArea=X_train['issueArea'].apply(issue_consolidate),
    # Map each party code onto its party category.
    petitioner=X_train['petitioner'].apply(lambda code: party_categorizer(int(code))),
    respondent=X_train['respondent'].apply(lambda code: party_categorizer(int(code))),
    # Binary flag: jurisdiction code '1' versus every other code.
    jurisdiction=X_train['jurisdiction'].apply(lambda code: '1' if code == '1' else '0'),
    # Binary flag: cert reason '1' or '12' versus every other code.
    certReason=X_train['certReason'].apply(lambda code: '1' if code in ('1', '12') else '0'),
    # Integer flag for lower-court disagreement.
    lcDisagreement=X_train['lcDisagreement'].apply(lambda code: 1 if code == '1' else 0),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [16]:
# Verify dtypes after recoding (lcDisagreement is now an integer column).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21575 entries, 0 to 19857
Data columns (total 11 columns):
caseId                    21575 non-null object
naturalCourt              21575 non-null object
petitioner                21575 non-null object
respondent                21575 non-null object
jurisdiction              21575 non-null object
caseOrigin                21575 non-null object
caseSource                21575 non-null object
lcDisagreement            21575 non-null int64
certReason                21575 non-null object
lcDispositionDirection    21575 non-null object
issueArea                 21575 non-null object
dtypes: int64(1), object(10)
memory usage: 2.0+ MB


In [17]:
# Keep a reference to the frame that still carries the case identifier.
X_train_full = X_train

# caseId is unique per row, so it is removed before modelling. It is dropped
# by name rather than by position: a positional slice strips one more real
# feature every time this cell is re-run, while errors='ignore' makes a
# re-run a no-op.
X_train = X_train_full.drop('caseId', axis=1, errors='ignore')

In [18]:
# Confirm that only the ten model features remain.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21575 entries, 0 to 19857
Data columns (total 10 columns):
naturalCourt              21575 non-null object
petitioner                21575 non-null object
respondent                21575 non-null object
jurisdiction              21575 non-null object
caseOrigin                21575 non-null object
caseSource                21575 non-null object
lcDisagreement            21575 non-null int64
certReason                21575 non-null object
lcDispositionDirection    21575 non-null object
issueArea                 21575 non-null object
dtypes: int64(1), object(9)
memory usage: 1.8+ MB


In [19]:
# Fit the one-hot encoder on the training features (fit returns the encoder
# itself), then replace X_train with its dense encoded matrix. The fitted
# encoder is kept so the test set can be transformed identically.
ohc = OneHotEncoder(sparse=False).fit(X_train)
X_train = ohc.transform(X_train)


In [20]:
# Baseline classifier: linear-kernel SVM evaluated with 5-fold cross-validation.
# SVC and cross_val_score come from the notebook's import cell.
lin_model = SVC(kernel='linear')

scores = cross_val_score(lin_model, X_train, y_train, cv=5)
sm = scores.mean()  # mean score across the five folds
ss = scores.std()   # spread of the score across the folds

In [21]:
# Report the cross-validation mean score followed by its standard deviation.
print(sm, ss)

0.54027413081 0.0682070148124


In [23]:
# Fit the SVM on the full encoded training set before scoring the test set.
lin_model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Load the held-out test set and apply the same cleaning as the training set:
# drop the unused columns, then drop rows with any missing value.
df_test = pd.read_csv('case_level_test_set.csv', encoding='ISO 8859-1', dtype='object', index_col=0)
df_test = df_test.drop(dropcols, axis=1).dropna(axis=0)

y_test = df_test[target]
# Explicit copy so the recoding below never writes into a slice of df_test
# (the source of the SettingWithCopyWarning).
X_test = df_test.loc[:, df_test.columns != target].copy()

# Same recoding as the training features. lcDisagreement is converted to an
# integer flag here as well; without it the encoder, fitted on integer 0/1,
# would be given the raw strings '0'/'1' at transform time.
X_test = X_test.assign(
    issueArea=X_test['issueArea'].apply(issue_consolidate),
    petitioner=X_test['petitioner'].apply(lambda code: party_categorizer(int(code))),
    respondent=X_test['respondent'].apply(lambda code: party_categorizer(int(code))),
    jurisdiction=X_test['jurisdiction'].apply(lambda code: '1' if code == '1' else '0'),
    certReason=X_test['certReason'].apply(lambda code: '1' if code in ('1', '12') else '0'),
    lcDisagreement=X_test['lcDisagreement'].apply(lambda code: 1 if code == '1' else 0),
)

# Keep the frame with caseId for reference and remove the identifier by name,
# so a schema mismatch fails loudly instead of silently dropping a feature.
X_test_full = X_test
X_test = X_test_full.drop('caseId', axis=1)

# Encode with the encoder fitted on the training data (never re-fit on test).
X_test = ohc.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [26]:
# Score the fitted SVM on the encoded held-out test set.
lin_model.score(X_test, y_test)

0.65517241379310343