In [129]:
%matplotlib inline
import pandas as pd
import numpy as np
from datetime import datetime

In [130]:
#############################################
# PANDAS HELPERS
#############################################

def remove_column_from_data_frame(col_to_remove, data_frame):

    if col_to_remove in list(data_frame.columns):
        data_frame.drop(col_to_remove, axis=1, inplace=True)

        
def remove_columns_from_data_frame(cols_to_remove, data_frame):

    column_dict = {x: None for x in list(data_frame.columns)}

    cols_to_remove = [x for x in cols_to_remove if x in column_dict]

    data_frame.drop(labels=cols_to_remove, axis=1, inplace=True)
    

def remove_columns_like(column_pattern, data_frame):
    
    for column in list(data_frame.columns):
        if column_pattern in column:
            data_frame.drop(column, axis=1, inplace=True)


def fill_nas(value, data_frame):
    
    data_frame.fillna(0, inplace=True)

In [131]:
#############################################
# DATA RETRIEVAL HELPERS
#############################################

def get_data(n_rows=None):

    if n_rows is not None:
        df = pd.read_csv('final_feats_without_dummies_2.csv', low_memory=False, nrows=n_rows)
        df_y = pd.read_csv('final_outs_2.csv', low_memory=False, nrows=n_rows)
    else:
        df = pd.read_csv('final_feats_without_dummies_2.csv', low_memory=False)
        df_y = pd.read_csv('final_outs_2.csv', low_memory=False)
    
    
    # Drop labels and a redundant column
    remove_columns_from_data_frame(['Unnamed: 0', 'Unnamed: 0.1' 'dissent', 'dissentdummy'], df)
    
    
    return df, df_y


def get_x_y(n_rows=None):
    
    df, df_y = get_data(n_rows)

    #fill_nas(0, df)
    
    return df.values, df_y.ix[:,1].values


def get_columns(df):
    
    #df = pd.read_csv('final_feats_without_dummies_2.csv', low_memory=False, nrows=2)
    return list(df.columns)


def print_report(y, y_pred):

    print classification_report(y, y_pred)
    


In [132]:
def drop_unneeded_cols(df):
    del_cols = ['fileid','cite','vol','beginpg','endopin','endpage','docnum','priorpub','_merge','year',
            'circuit','pseatno','decision_date','aatty_first_name','aatty_last_name','afirm_name',
            'ratty_first_name','ratty_last_name','rname_of_first_listed_amicus_gro','rfirm_namew','decisiondatenew2',
           'j1name','j2name','j3name','quartertoelect','pname','seatno','success','lsuc','ls1','ls2','ls3','lp',
            'lp2','lp3','sseatno','congress','congreso','afirst_listed_amicus_group','yearquarter','name','Name','State','j',
            'codej4','j4vote1','j4vote2','j4maj1','j4maj2','codej5','j5vote1','j5vote2','j5maj1','j5maj2',
            'codej6','j6vote1','j6vote2','j6maj1','j6maj2','codej7','j7vote1','j7vote2','j7maj1','j7maj2',
            'codej8','j8vote1','j8vote2','j8maj1','j8maj2','codej9','j9vote1','j9vote2','j9maj1','j9maj2',
            'codej10','j10vote1','j10vote2','j10maj1','j10maj2','codej11','j11vote1','j11vote2','j11maj1','j11maj2',
            'codej12','j12vote1','j12vote2','j12maj1','j12maj2','codej13','j13vote1','j13vote2','j13maj1','j13maj2',
            'codej14','j14vote1','j14vote2','j14maj1','j14maj2','codej15','j15vote1','j15vote2','j15maj1','j15maj2','j16maj1','j16vote1']
    df.drop(labels=del_cols,axis=1,inplace=True)
    moredropcolumns=df.columns.tolist() # .tolist?
    for i in moredropcolumns:
        if len(pd.unique(df[i]))==1:
            df.drop(labels=i,axis=1,inplace=True)
    df.drop(labels=['casenum','j2vote1','j2vote2','j2maj1','direct1',
                          'j2maj2','j3vote1','j3vote2','j3maj1','j3maj2','majvotes','ids'],axis=1,inplace=True)
    return df
    
def dummify(df):
    new_cols=df.columns
    new_cols=new_cols.tolist()
    keep_cols=['j1score','j2score','j3score','popularpct','electoralpct','closerd','fartherd','dAds3','dF2Ads3',
           'dF1Ads3','dL1Ads3','dL2Ads3','dL3Ads3','dL4Ads3','dL5Ads3','logAds3','logL1Ads3','logL2Ads3','logF1Ads3',
          'logF2Ads3','decade2','propneg','likely_elev2','score','d12','d13','d23']
    for col in keep_cols:
        if col in new_cols:
            new_cols.remove(col)
    df2=pd.get_dummies(df,columns=new_cols,dummy_na=True,sparse=False)
    df2=df2.fillna(value=0)
    return df2

def remove_bad_rows(df):
    
    #remove rows where codej1==codej2
#     df[df.codej1==df.codej2].index
    same_cols = df[df.codej1==df.codej2].index
    df=df.drop(same_cols).reset_index(drop=True)
    
    #remove rows where >3 judges occur
#     pp = pd.read_csv('../raw/Votelevel_stuffjan2013.csv')
#     qq=pp.groupby(by=['casenum']).count()
#     pd.unique(qq.month)
#     rr=qq[qq.month==6].reset_index()
#     rr.shape
    
    #remove rows where codej2==null
    #df[map(lambda x: not(x),pd.notnull(df.ix[:]["codej2"]).tolist())]
    nan_cols=df[map(lambda x: not(x),pd.notnull(df.ix[:]["codej2"]).tolist())].index
    nan_cols.append(df[map(lambda x: not(x),pd.notnull(df.ix[:]["codej1"]).tolist())].index)
    df=df.drop(nan_cols).reset_index(drop=True)
    
    return df

### Feat 1: If sat together previously

In [133]:
df_x, df_y = get_data()

In [157]:
def ret_datetime(yr,month,date):
    from datetime import datetime
    return datetime(int(yr),int(month),int(date))

def return_sat_together_count(df):
    """
    Arguments:
    df: dataframe of judge frames
    Returns: None.
    Appends column with sat_together_count feature.
    """
    ###assumes df.time, df.month, df.day present
    df['datetime'] = df.apply(lambda row: ret_datetime(row['year'], row['month'], row['day']), axis=1)
    df['sat_together_count']=0
    #df['judge_pairs']=df.apply(lambda row: tuple([row['codej1'],row['codej2']]),axis=1)
    df['judge_pairs']=pd.Series(zip(df.codej1.values,df.codej2.values))
    for ind,pair in enumerate(df.judge_pairs):
        templist=[]
        for ind2,pair2 in enumerate(df.judge_pairs):
            if set(pair)==set(pair2):
                if df.ix[ind2,'datetime']<df.ix[ind,'datetime']:
                    templist.append(df.ix[ind2,'datetime'])
        df.ix[ind,'sat_together_count']=len(set(templist)) #df.ix[ind,'sat_together_count']+1
    df.drop(labels=['judge_pairs','datetime'],axis=1,inplace=True)
    return df
    

In [135]:
df2=df_x.select(lambda x: x=='year' or x=='day' or x=='month' or x=='codej1' or x=='codej2' or x=='casenum',axis=1)

In [136]:
df2.head()

Unnamed: 0,casenum,codej1,codej2,day,month,year
0,12954,10358,305,10,11,1939
1,12954,10358,307,10,11,1939
2,12954,305,307,10,11,1939
3,12954,305,10358,10,11,1939
4,12954,307,305,10,11,1939


In [81]:
df2['datetime'] = df2.apply(lambda row: ret_datetime(row['year'], row['month'], row['day']), axis=1)

In [137]:
df2['judge_pairs']=pd.Series(zip(df2.codej1.values,df2.codej2.values))

In [138]:
llist=list(set(df2.judge_pairs))

In [142]:
llist[0][0]

540.0

In [119]:
df3=df2.query('(codej1=='+str(llist[0][0])+'and codej2==553) or (codej1==553 and codej2==540)')

In [123]:
(df3.datetime)

59427   1969-12-22
59429   1969-12-22
61077   1975-05-01
61079   1975-05-01
61741   1969-02-03
61744   1969-02-03
61776   1971-06-03
61780   1971-06-03
61788   1978-02-06
61791   1978-02-06
62003   1972-03-08
62006   1972-03-08
62008   1971-06-25
62012   1971-06-25
62050   1972-01-31
62054   1972-01-31
62122   1970-09-22
62126   1970-09-22
62146   1971-11-18
62150   1971-11-18
62200   1972-02-28
62204   1972-02-28
62248   1971-01-05
62253   1971-01-05
62255   1970-05-13
62258   1970-05-13
62537   1970-08-18
62540   1970-08-18
62741   1970-01-08
62744   1970-01-08
62770   1971-11-08
62775   1971-11-08
62825   1972-01-03
62829   1972-01-03
62884   1972-01-31
62888   1972-01-31
62926   1973-02-14
62928   1973-02-14
Name: datetime, dtype: datetime64[ns]

In [128]:
df3.datetime

59427   1969-12-22
59429   1969-12-22
61077   1975-05-01
61079   1975-05-01
61741   1969-02-03
61744   1969-02-03
61776   1971-06-03
61780   1971-06-03
61788   1978-02-06
61791   1978-02-06
62003   1972-03-08
62006   1972-03-08
62008   1971-06-25
62012   1971-06-25
62050   1972-01-31
62054   1972-01-31
62122   1970-09-22
62126   1970-09-22
62146   1971-11-18
62150   1971-11-18
62200   1972-02-28
62204   1972-02-28
62248   1971-01-05
62253   1971-01-05
62255   1970-05-13
62258   1970-05-13
62537   1970-08-18
62540   1970-08-18
62741   1970-01-08
62744   1970-01-08
62770   1971-11-08
62775   1971-11-08
62825   1972-01-03
62829   1972-01-03
62884   1972-01-31
62888   1972-01-31
62926   1973-02-14
62928   1973-02-14
Name: datetime, dtype: datetime64[ns]

In [127]:
df3.query('datetime==1969-12-22')

ValueError: unkown type object

In [89]:
for x in llist[:1]:
    temp=df2.query('(codej1=='+str(x[0])+'and codej2=='+str(x[1])+') or (codej1=='+str(x[1])+' and codej2=='+str(x[0])+')')
    #temp=df2.query('set([codej1,codej2])==x') #slice
    templist=list(set(temp.judge_pairs))
    for temper in templist:
        temp2=temp.query('datetime=='+str(temper))
    s_ind=np.argsort(df2.datetime.values) #sorted index
    for 

Unnamed: 0,casenum,codej1,codej2,day,month,year,datetime,sat_together_count,judge_pairs
0,12954,10358,305,10,11,1939,1939-11-10,0,"(10358.0, 305.0)"
1,12954,10358,307,10,11,1939,1939-11-10,0,"(10358.0, 307.0)"
2,12954,305,307,10,11,1939,1939-11-10,10,"(305.0, 307.0)"
3,12954,305,10358,10,11,1939,1939-11-10,0,"(305.0, 10358.0)"
4,12954,307,305,10,11,1939,1939-11-10,10,"(307.0, 305.0)"


In [143]:
df2.head(10)

Unnamed: 0,casenum,codej1,codej2,day,month,year,judge_pairs
0,12954,10358,305,10,11,1939,"(10358.0, 305.0)"
1,12954,10358,307,10,11,1939,"(10358.0, 307.0)"
2,12954,305,307,10,11,1939,"(305.0, 307.0)"
3,12954,305,10358,10,11,1939,"(305.0, 10358.0)"
4,12954,307,305,10,11,1939,"(307.0, 305.0)"
5,12954,307,10358,10,11,1939,"(307.0, 10358.0)"
6,14244,10140,103,23,12,1931,"(10140.0, 103.0)"
7,14244,10140,116,23,12,1931,"(10140.0, 116.0)"
8,14244,116,103,23,12,1931,"(116.0, 103.0)"
9,14244,116,10140,23,12,1931,"(116.0, 10140.0)"


In [95]:
np.argsort(df2.datetime[:10].values)

array([6, 7, 8, 9, 0, 1, 2, 3, 4, 5])

In [92]:
sorted(df2.datetime[:10].values)

[numpy.datetime64('1931-12-23T00:00:00.000000000Z'),
 numpy.datetime64('1931-12-23T00:00:00.000000000Z'),
 numpy.datetime64('1931-12-23T00:00:00.000000000Z'),
 numpy.datetime64('1931-12-23T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z'),
 numpy.datetime64('1939-11-10T00:00:00.000000000Z')]

In [None]:

def rec_return_sat_together_count(df):
    """
    Arguments:
    df: dataframe of judge frames
    Returns: None.
    Appends column with sat_together_count feature.
    """
    ###assumes df.time, df.month, df.day present
    df['datetime'] = df.apply(lambda row: ret_datetime(row['year'], row['month'], row['day']), axis=1)
    df['sat_together_count']=0
    #df['judge_pairs']=df.apply(lambda row: tuple([row['codej1'],row['codej2']]),axis=1)
    df['judge_pairs']=pd.Series(zip(df.codej1.values,df.codej2.values))
    llist=set(df.judge_pairs)
    for x in llist:
        temp=df.query('set([codej1,codej2])==x')
        np.argsort(df2.datetime[:10].values)
    
    
    for ind,pair in enumerate(df.judge_pairs):
        for ind2,pair2 in enumerate(df.judge_pairs):
            if set(pair)==set(pair2):
                if df.ix[ind2,'datetime']<df.ix[ind,'datetime']:
                    df.ix[ind,'sat_together_count']=df.ix[ind,'sat_together_count']+1
    df.drop(labels=['judge_pairs','datetime'],axis=1,inplace=True)
    return df
    

In [80]:
df2=return_sat_together_count(df2)

KeyboardInterrupt: 

In [None]:
df_x['sat_together_count']=df2['sat_together_count']

In [149]:
llist

[(540.0, 553.0),
 (104.0, 115.0),
 (1210.0, 110.0),
 (733.0, 708.0),
 (10287.0, 236.0),
 (1009.0, 11013.0),
 (502.0, 11180.0),
 (619.0, 10690.0),
 (714.0, 726.0),
 (513.0, 513.0),
 (933.0, 904.0),
 (515.0, 1103.0),
 (952.0, 20988.0),
 (555.0, 20521.0),
 (20977.0, 926.0),
 (1024.0, 11020.0),
 (421.0, 10446.0),
 (20622.0, 633.0),
 (639.0, 609.0),
 (106.0, 10152.0),
 (217.0, 240.0),
 (613.0, 626.0),
 (1222.0, 544.0),
 (302.0, 315.0),
 (612.0, 20620.0),
 (622.0, 640.0),
 (703.0, 10718.0),
 (710.0, 10734.0),
 (644.0, 20615.0),
 (20904.0, 941.0),
 (737.0, 731.0),
 (10950.0, 962.0),
 (912.0, 933.0),
 (403.0, 10401.0),
 (10343.0, 303.0),
 (936.0, 20972.0),
 (305.0, 928.0),
 (967.0, 961.0),
 (224.0, 208.0),
 (104.0, 10129.0),
 (1001.0, 1016.0),
 (528.0, 504.0),
 (921.0, 20937.0),
 (20618.0, 644.0),
 (1238.0, 1206.0),
 (560.0, 553.0),
 (11020.0, 1018.0),
 (728.0, 719.0),
 (735.0, 1240.0),
 (10510.0, 523.0),
 (603.0, 634.0),
 (236.0, 813.0),
 (826.0, 10332.0),
 (741.0, 728.0),
 (926.0, 30902.0),


In [19]:
test=[(10993.0, 229.0),(927.0, 959.0),(1120.0, 224.0),(11039.0, 1008.0),(104.0, 115.0)]

In [20]:
len(set(zip(df_x.codej1.values,df_x.codej2.values)))

18038

In [178]:
df2.query('(codej1==104.0 and codej2==115.0)')

Unnamed: 0,casenum,codej1,codej2,day,month,year,judge_pairs
7798,16037,104,115,27,3,1991,"(104.0, 115.0)"
13444,733,104,115,30,12,1986,"(104.0, 115.0)"
45153,743,104,115,9,1,1986,"(104.0, 115.0)"
45993,15344,104,115,20,11,1989,"(104.0, 115.0)"
62790,16047,104,115,8,10,1991,"(104.0, 115.0)"
79711,372,104,115,31,7,1987,"(104.0, 115.0)"
80893,362,104,115,23,6,1987,"(104.0, 115.0)"
82009,20749,104,115,24,11,1999,"(104.0, 115.0)"
83215,16042,104,115,23,7,1991,"(104.0, 115.0)"
83322,30,104,115,29,7,1988,"(104.0, 115.0)"


In [179]:
df4=df2.query('(codej1==959.0 and codej2==927.0) or (codej1==927.0 and codej2==959.0) or (codej1==104.0 and codej2==115.0) or (codej1==115.0 and codej2==104.0)').reset_index(drop=True)

In [180]:
df4

Unnamed: 0,casenum,codej1,codej2,day,month,year,judge_pairs
0,16037,104,115,27,3,1991,"(104.0, 115.0)"
1,16037,115,104,27,3,1991,"(115.0, 104.0)"
2,733,104,115,30,12,1986,"(104.0, 115.0)"
3,733,115,104,30,12,1986,"(115.0, 104.0)"
4,12554,959,927,26,5,1942,"(959.0, 927.0)"
5,12554,927,959,26,5,1942,"(927.0, 959.0)"
6,12552,959,927,17,2,1942,"(959.0, 927.0)"
7,12552,927,959,17,2,1942,"(927.0, 959.0)"
8,13045,959,927,1,6,1939,"(959.0, 927.0)"
9,13045,927,959,1,6,1939,"(927.0, 959.0)"


In [60]:
df2.to_csv('final_feats_without_dummies_3.csv')

In [11]:
df_x=remove_bad_rows(df_x)

In [166]:
df5[df5.index=np.argsort(df5.sat_together_count).values]

SyntaxError: invalid syntax (<ipython-input-166-76bd4c0246be>, line 1)

In [182]:
df5.ix[np.argsort(df5.sat_together_count).values]

Unnamed: 0,casenum,codej1,codej2,day,month,year,sat_together_count
64,1081,104,115,4,8,1985,0
23,13201,927,959,4,4,1938,0
22,13201,959,927,4,4,1938,0
65,1081,115,104,4,8,1985,0
20,13210,959,927,23,4,1938,1
21,13210,927,959,23,4,1938,1
36,743,104,115,9,1,1986,1
37,743,115,104,9,1,1986,1
25,13204,927,959,18,10,1938,2
24,13204,959,927,18,10,1938,2


In [181]:
df5=return_sat_together_count(df4)