In [1]:
import numpy as np
import pandas as pd
import math
from datetime import datetime

### Load Justice-centered data into pandas df

In [2]:
# Training split (presumably features in X and targets in Y -- confirm against
# the preprocessing notebook that wrote these files).
trainX, trainY = (
    pd.read_csv('../data/trainX_justice.csv'),
    pd.read_csv('../data/trainY_justice.csv'),
)
# Test split, read in the same X-then-Y order.
testX, testY = (
    pd.read_csv('../data/testX_justice.csv'),
    pd.read_csv('../data/testY_justice.csv'),
)

### Variables (from preprocessing)

In [3]:
# Column names grouped by category.  The groups are plain lists so they can be
# concatenated with `+` when selecting columns.

# Identifier column that is removed from the feature frame.
id_variables_to_drop = [u'justiceName']  # Name and unique ID

# Identifier columns that stay in the feature frame.
id_variables_to_keep = [
    u'justice',  # Name and unique ID
    u'caseId',
    u'docketId',
    u'caseIssuesId',
    u'voteId',
    u'usCite',
    u'sctCite',
    u'ledCite',
    u'lexisCite',
    u'docket',
]

# Background variables.
bg_variables = [
    u'caseName',
    u'petitioner',
    u'petitionerState',
    u'respondent',
    u'respondentState',
    u'jurisdiction',
    u'adminAction',
    u'adminActionState',
    u'threeJudgeFdc',
    u'caseOrigin',
    u'caseOriginState',
    u'caseSource',
    u'caseSourceState',
    u'lcDisagreement',
    u'certReason',
    u'lcDisposition',
    u'lcDispositionDirection',
]

# Chronological variables, split into the ones to include and the ones not to.
chrono_include = [
    u'naturalCourt',
    u'chief',
]
chrono_donotinclude = [
    u'dateDecision',
    u'decisionType',
    u'term',
    u'dateArgument',
    u'dateRearg',
]
chrono_variables = chrono_include + chrono_donotinclude

# Substantive variables.
substantive_variables = [
    u'issue',
    u'issueArea',
    u'decisionDirection',
    u'decisionDirectionDissent',
    u'authorityDecision1',
    u'authorityDecision2',
    u'lawType',
    u'lawSupp',
    u'lawMinor',
]

# Outcome variables.
outcome_variables = [
    u'declarationUncon',
    u'caseDisposition',
    u'caseDispositionUnusual',
    u'partyWinning',
    u'precedentAlteration',
    u'firstAgreement',
    u'secondAgreement',
]

# Voting variables.
voting_variables = [
    u'voteUnclear',
    u'majOpinWriter',
    u'majOpinAssigner',
    u'splitVote',
    u'majVotes',
    u'minVotes',
    u'vote',
    u'opinion',
    u'direction',
    u'majority',
]


## Drop unimportant ID variables

In [4]:
# Set the dropped identifier column(s) aside before trainX is narrowed.
trainX_id = trainX[id_variables_to_drop].copy()
# Keep only the retained IDs, the background columns, the included
# chronological columns and the substantive columns; everything else goes.
trainX = trainX[
    id_variables_to_keep
    + bg_variables
    + chrono_include
    + substantive_variables
].copy()

## One-hot encode select categorical variables
#### + impute nulls

In [43]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
class Categorical(object):
    '''Impute, label-encode and one-hot encode the low-cardinality columns of a frame.

    Columns with fewer than ``ohe_threshold`` distinct values (counted after
    nulls are replaced by -1) are treated as categorical; all other columns
    are carried along unchanged.

    Attributes set by the constructor:
        arr             imputed frame, reordered as other columns then categorical ones
        columns         column names in that same order
        other           copy of the non-categorical columns
        cat             label-encoded categorical columns (DataFrame)
        cat_ohe         result of ``OneHotEncoder.transform`` applied to ``cat``
        le              fitted LabelEncoders, parallel to ``ohe_vars``
        ohe             fitted OneHotEncoder
        ohe_vars        names of the categorical columns
        other_vars      names of the remaining columns
        n_samples, n_features, n_cat_features, n_justices
                        size bookkeeping
    '''

    def __init__(self, df, ohe_threshold = 1000):
        '''Build all encodings from ``df``.

        df            -- DataFrame to encode; it is not modified.  It must
                         contain a 'justice' column that ends up among the
                         categorical columns, because ``n_justices`` is read
                         from it.
        ohe_threshold -- a column is categorical when it has strictly fewer
                         distinct values than this.
        '''
        # fillna returns a new frame, so the caller's df stays untouched.
        self.arr = df.fillna(-1)  # Impute values

        # Select the categorical features to OHE
        ohe_vars = []
        other_vars = []
        for c in self.arr.columns:
            u = len(self.arr[c].unique())
            if u < ohe_threshold:
                # Single-argument print(...) gives identical output on Python 2 and 3.
                print("{0}: {1} unique".format(c, u))
                ohe_vars.append(c)
            else:
                other_vars.append(c)

        self.n_samples, self.n_features = self.arr.shape
        self.n_cat_features = len(ohe_vars)
        self.ohe_vars = ohe_vars
        self.other_vars = other_vars
        self.arr = pd.concat([self.arr[other_vars], self.arr[ohe_vars]], axis=1)
        self.columns = other_vars + ohe_vars
        self.other = self.arr[other_vars].copy()  # ID and other features
        self.cat = self.arr[ohe_vars].copy()

        # Label-encode first so the one-hot encoder is fitted on the integer codes.
        self.LE()
        self.cat_ohe = self.cat.copy()
        self.n_justices = len(self.cat['justice'].unique())
        # One-hot encode the label-encoded copy; self.cat keeps the integer codes.
        self.OHE()

    def LE(self):
        '''Label-encode every categorical column of ``self.cat`` in place.

        The fitted encoders are stored in ``self.le`` in the same order as
        ``self.ohe_vars`` so codes can be mapped back with ``getValue``.
        '''
        encoders = []
        for c in self.ohe_vars:
            encoder = LabelEncoder()
            self.cat[c] = encoder.fit_transform(self.cat[c])
            encoders.append(encoder)
        self.le = encoders  # label encoders (useful for reverse transforming also)

    def OHE(self):
        '''Fit a OneHotEncoder on ``self.cat_ohe`` and replace it by its encoding.'''
        self.ohe = OneHotEncoder()
        self.ohe.fit(self.cat_ohe)
        self.cat_ohe = self.ohe.transform(self.cat_ohe)

    def orig_cat(self):
        '''Recover the label-encoded integer codes from the one-hot matrix.

        Returns an array of shape (n_samples, n_cat_features).  Requires the
        one-hot encoder that was fitted on this dataset (``self.ohe``).

        NOTE(review): ``active_features_`` and ``feature_indices_`` are
        attributes of the legacy integer-mode OneHotEncoder; newer
        scikit-learn releases appear to have dropped them -- confirm the
        installed version before relying on this method.
        '''
        # The one-hot matrix is self.cat_ohe; self.cat is the label-encoded
        # DataFrame and has no sorted_indices().
        one_hot = self.cat_ohe.sorted_indices()
        active = np.asarray(self.ohe.active_features_)
        # With sorted column indices, the k-th stored entry of a row belongs
        # to the k-th categorical feature.
        positions = active[one_hot.indices].reshape(self.n_samples, self.n_cat_features)
        # Subtract each feature's starting offset to get back its own code.
        return positions - self.ohe.feature_indices_[:-1]

    def getValue(self, col):
        '''Return the original (pre-label-encoding) values of column ``col``.

        Returns None when ``col`` is not one of the categorical columns.
        '''
        if col not in self.ohe_vars:
            return None
        encoder = self.le[self.ohe_vars.index(col)]
        return encoder.inverse_transform(self.cat[col])

    def isolate_justice(self, ID, deep=False, ind=True):
        '''Select the rows of ``self.cat`` that belong to one justice.

        ID   -- label-encoded justice id, 0 <= ID < n_justices.
        ind  -- when True (default) return only the index of the matching
                rows; ``deep`` is then ignored.
        deep -- when ``ind`` is False, return a copy of the matching rows
                instead of the selection itself.

        Prints an error and returns None when ``ID`` is out of range.
        '''
        # Check if ID is a legal label-encoded justice id
        if ID < 0 or ID >= self.n_justices:
            print("Error: Not a legal justice ID")
            return None

        rows = self.cat.loc[self.cat['justice'] == ID]
        if ind:
            return rows.index
        return rows.copy() if deep else rows

In [44]:
# Encode the training frame; the constructor prints one line per column it
# treats as categorical.
trainX_Cat = Categorical(trainX)

justice: 36 unique
petitioner: 260 unique
petitionerState: 57 unique
respondent: 240 unique
respondentState: 56 unique
jurisdiction: 11 unique
adminAction: 112 unique
adminActionState: 53 unique
threeJudgeFdc: 3 unique
caseOrigin: 132 unique
caseOriginState: 53 unique
caseSource: 109 unique
caseSourceState: 53 unique
lcDisagreement: 3 unique
certReason: 14 unique
lcDisposition: 13 unique
lcDispositionDirection: 4 unique
naturalCourt: 31 unique
chief: 5 unique
issue: 266 unique
issueArea: 15 unique
decisionDirection: 4 unique
decisionDirectionDissent: 3 unique
authorityDecision1: 8 unique
authorityDecision2: 8 unique
lawType: 9 unique
lawSupp: 176 unique
lawMinor: 828 unique


In [45]:
# Rows belonging to the justice whose label-encoded ID is 5.  `ind` defaults
# to True, so the row index is returned rather than the rows themselves.
trainX_Cat.isolate_justice(5)

Int64Index([   1,   10,   19,   28,   42,   46,   55,   64,   73,   82,
            ...
            7873, 7882, 7891, 7900, 7909, 7918, 7927, 7936, 7945, 7954],
           dtype='int64', length=884)

#### Impute nulls

In [None]:
# Replace every missing value with the sentinel -1.
trainX = trainX.fillna(value=-1)

In [None]:
# Split the columns by cardinality: a column with fewer than `ohe_threshold`
# distinct values is queued for one-hot encoding, the rest are left alone.
# Note that this really only avoids the IDs at this point... the rule may
# have to be more discriminating.
ohe_threshold = 1000
ohe_variables = []
other_vars = []
for c in trainX.columns:
    u = len(trainX[c].unique())
    if u < ohe_threshold:
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("{0}: {1} unique".format(c, u))
        ohe_variables.append(c)
    else:
        other_vars.append(c)
n_cat_features = len(ohe_variables)
# Reorder the frame: categorical columns first, then everything else.
trainX = pd.concat([trainX[ohe_variables], trainX[other_vars]], axis=1)

#### LabelEncode the categorical (and text) fields to span from 0-n (without weird jumps)

In [None]:
# Every column in ohe_variables is label-encoded (an earlier idea, restricting
# this to the object-dtype text columns, was abandoned).
# `le` stays at module level so inverse_transform remains available later;
# le[k] is the encoder fitted on ohe_variables[k].
le = []
i = 0
for c in ohe_variables:
    le.append(LabelEncoder().fit(trainX[c]))
    trainX[c] = le[-1].transform(trainX[c])
    i += 1

In [None]:
# Fit a one-hot encoder on the label-encoded categorical columns.
# (A +1 shift to keep categories non-negative was considered and left out.)
enc = OneHotEncoder()
enc.fit(trainX[ohe_variables])
# NOTE(review): n_values_ / feature_indices_ are attributes of the legacy
# integer-mode OneHotEncoder -- confirm they exist in the installed version.
# Single-argument print(...) gives identical output on Python 2 and 3.
print(enc.n_values_)
print(enc.feature_indices_)

In [None]:
# Encode the categorical columns with the fitted encoder.  The result is kept
# in its own variable; the cells below read its `.indices` and call
# `.sorted_indices()` on it.
trainX_cat =  enc.transform(trainX[ohe_variables])
# Abandoned alternative: writing the encoding back into trainX.
# trainX.loc[:, ohe_variables] = enc.transform(trainX[ohe_variables])
# print trainX

In [None]:
# Inspect the fitted encoder's bookkeeping arrays.
# Single-argument print(...) gives identical output on Python 2 and 3.
print(enc.active_features_)
print(len(enc.active_features_))
print(enc.feature_indices_)
print(enc.n_values_)

In [None]:
# Show the stored column indices before and after sorting them.
# Single-argument print(...) gives identical output on Python 2 and 3.
print(trainX_cat.indices)
trainX_cat = trainX_cat.sorted_indices()
print(trainX_cat.indices)

In [None]:
# Display the current column labels of trainX.
trainX.columns

In [None]:
class categorical(object):
    '''Earlier variant of the encoder in which ``cat`` ends up one-hot encoded.

    Columns with fewer than ``ohe_threshold`` distinct values (counted after
    nulls are replaced by -1) are label-encoded and then one-hot encoded; the
    one-hot result overwrites ``self.cat``.

    Attributes set by the constructor:
        arr         imputed frame, reordered as other columns then categorical ones
        columns     column names in that same order
        other       the non-categorical columns
        cat         result of ``OneHotEncoder.transform`` on the label-encoded columns
        le          fitted LabelEncoders, parallel to ``ohe_vars``
        ohe         fitted OneHotEncoder
        ohe_vars    names of the categorical columns
        other_vars  names of the remaining columns
        n_samples, n_features, n_cat_features
                    size bookkeeping
    '''

    def __init__(self, df, ohe_threshold = 1000):
        '''Build the encodings from ``df`` (which is not modified).

        ohe_threshold -- a column is categorical when it has strictly fewer
                         distinct values than this.
        '''
        # fillna returns a new frame; rebind so the imputed values are used.
        df = df.fillna(-1)  # Impute values

        # Select the categorical features to OHE
        ohe_vars = []
        other_vars = []
        for c in df.columns:
            u = len(df[c].unique())
            if u < ohe_threshold:
                # Single-argument print(...) gives identical output on Python 2 and 3.
                print("{0}: {1} unique".format(c, u))
                ohe_vars.append(c)
            else:
                other_vars.append(c)

        self.n_samples, self.n_features = df.shape
        self.n_cat_features = len(ohe_vars)
        self.ohe_vars = ohe_vars
        self.other_vars = other_vars
        self.arr = pd.concat([df[other_vars], df[ohe_vars]], axis=1)
        self.columns = other_vars + ohe_vars
        self.other = df[other_vars]  # ID and other features
        # Copy so that label-encoding below never writes into df or self.arr.
        self.cat = df[ohe_vars].copy()
        # Label Encode select categorical variables
        self.LE()
        # OHE the categorical data
        self.OHE()

    def LE(self):
        '''Label-encode every categorical column of ``self.cat`` in place.

        The fitted encoders are kept in ``self.le``, in the order of
        ``self.ohe_vars``, so the codes can be inverse-transformed later.
        '''
        encoders = []
        for c in self.ohe_vars:
            encoder = LabelEncoder()
            self.cat[c] = encoder.fit_transform(self.cat[c])
            encoders.append(encoder)
        self.le = encoders  # label encoders (useful for reverse transforming also)

    def OHE(self):
        '''Fit a OneHotEncoder on ``self.cat`` and replace it by its encoding.'''
        self.ohe = OneHotEncoder()
        self.ohe.fit(self.cat)
        self.cat = self.ohe.transform(self.cat)

    def orig_cat(self):
        '''Recover the label-encoded integer codes from the one-hot matrix.

        Returns an array of shape (n_samples, n_cat_features).  Requires the
        one-hot encoder that was fitted on this dataset (``self.ohe``).

        NOTE(review): ``active_features_`` and ``feature_indices_`` are
        attributes of the legacy integer-mode OneHotEncoder -- confirm they
        exist in the installed scikit-learn version.
        '''
        one_hot = self.cat.sorted_indices()
        active = np.asarray(self.ohe.active_features_)
        # With sorted column indices, the k-th stored entry of a row belongs
        # to the k-th categorical feature.
        positions = active[one_hot.indices].reshape(self.n_samples, self.n_cat_features)
        # Subtract each feature's starting offset to get back its own code.
        return positions - self.ohe.feature_indices_[:-1]

## Working Outline

* In progress...

The small amount of data suggests that an RNN may not be a particularly appropriate model for this task; however, it would be fun to try one in order to see how the sequence of issues or the long-term trends of a justice come into play in future decisions. 

### Workflow:

#### Data Preparation:

Impute values -> maybe encode NaNs as their own category, -1 (there are no negative values here, so -1 cannot collide with real data)

OHE categorical variables

Drop the justiceName column -> all information is in 'justice'


#### RNN

## Generate Features

Fill in NaNs with -1

In [None]:
# For every column, show its most frequent value(s) and how many entries are null.
for c in trainX.columns:
    # Single-argument print(...) gives identical output on Python 2 and 3.
    print(trainX[c].mode())
    print("Number NaN: {0}".format(trainX[c].isnull().sum()))

In [None]:
# Look up the entry at 0 in the authorityDecision2 column.
trainX['authorityDecision2'][0]