In [3]:
import pandas as pd
import numpy as np
from copy import deepcopy
import time
from joblib import Parallel, delayed
import multiprocessing
import resource
import gc

In [4]:
def remove_column_from_data_frame(col_to_remove, data_frame):

    if col_to_remove in list(data_frame.columns):
        data_frame.drop(col_to_remove, axis=1, inplace=True)

In [5]:
def remove_columns_from_data_frame(cols_to_remove, data_frame):

    column_dict = {x: None for x in list(data_frame.columns)}

    cols_to_remove = [x for x in cols_to_remove if x in column_dict]

    df.drop(labels=cols_to_remove, axis=1, inplace=True)

In [6]:
def print_resource_usage():
    
    print (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / (1.0 * 1e6)

In [7]:
def get_redundant_columns(data_frame, to_exclude):

    return [x for x in list(data_frame.columns) 
            if (len(pd.unique(data_frame[x])) == 1 and x not in to_exclude)]

In [8]:
def get_drop_list(data_frame):

    droplist = ['docketnumber','citation','JudgeCONCURRING','JudgeDissentingTouse',
                'songername', 'jOrigname','dissentdate','JudgesListTouse',
                'Judgeconcurring','dateamended','month','day','AmendedDate',
                'distanceAM','quartertoelectAM','JudgeDISSENTING1','JudgeDISSENTING2',
                'Author','AppointmentDate1','TerminationDate1','RecessAppointDate1',
                'AppointmentDate2','TerminationDate2','RecessAppointDate2',
                'AppointmentDate3','TerminationDate3','AppointmentDate','TerminationDate',
                'SenateConfirmationdate','RecessAppointDate','birthday','birthyear',
                'birthmonth','deathmonth','deathday','deathyear','judgelastname',
                'judgefirstname','judgemiddlename','retirementfromactiveservice','degreeyear1',
                'degreeyear2','degreeyear3','degreeyear4','vicelastnamepredecessor',
                'vicefirstnamepredecessor','hearings','placeofdeathcity','deathdate',
                'dateoftermination']

    droplist += ['MajOpinionWordCount','MajSelfCertainWords','minOpinionWordCount1',
                'MinSelfCertainWords1','ConcurenceWordCount1','ConcurSelfCertainWords1',
                'minOpinionWordCount2','MinSelfCertainWords2','ConcurenceWordCount2',
                'ConcurSelfCertainWords2','senatevoteayesnays']

    droplist += ['dissentOrconcurCaseid','yearq','Circuitjudge1','Circuitjudge2','id',
                'nominationdatesenateexecutivej','recessappointmentdate', 'committeeactiondate',
                'senatevotedateconfirmationdate','commissiondate','startdate','BecameSenior']

    droplist += ['RecessAppointDate4','AppointmentDate5','TerminationDate5',
                'RecessAppointDate5','AppointmentDate6','TerminationDate6',
                'RecessAppointDate6','RecessAppointDate3']

    return droplist + get_redundant_columns(data_frame, ['Dissenting1'])

In [9]:
def drop_columns(data_frame):

    remove_columns_from_data_frame(get_drop_list(data_frame), data_frame)


In [10]:
def for_voting(caseLIST, st, end):

#     if end > Length:
#          end = Length
            
#     print st,end
    
    newframe=pd.DataFrame()
    output=[]
    for case in caseList:
        temper=np.where(df.caseid==case)
        temper=(temper[0]).tolist()
        for i in range(len(temper)):
            for j in [1,2]:            
                for term in copylist:
                    name='%sANO'%(term)
                    df.loc[temper[i],name]=df.ix[temper[i-j],term]
                newframe=newframe.append(df.ix[temper[i]])
        if df.loc[temper[0],'Dissenting1']==0:      ## if dissenting is 0, no one disagree
           output=output+[0,0,0,0,0,0]
        else :
           a=df.loc[temper[0],'Dissenting1']        ## otherwise, find the one with 'j' value equal to Dissent1
           for i in range(len(temper)):             ##  'j' value correspond to the judge in that row, 
               if a==df.loc[temper[i],'j']:         ## take values 1,2,3
                   output=output+[1,1]
               elif a==df.loc[temper[i-1],'j']:
                   output=output+[1,0]
               elif a==df.loc[temper[i-2],'j']:
                   output=output+[0,1]
    
    
    assert newframe.shape[0]==len(output)   
    filename="tryvoting%i.csv"%(st)
    filename2="tryvoteoutput%i.csv"%(st)
    newframe.to_csv(filename)
    (pd.DataFrame(output)).to_csv(filename2)
    return output

In [11]:
def run_parallel_jobs():

    numarray=[]
    i = 0
    while i < Length:
        numarray.append(i)
        i = i + 1000

    print numarray

    num_cores = multiprocessing.cpu_count()

    jobs = Parallel(n_jobs=num_cores)(delayed(for_voting)(caseList, a, a + 1000) for a in numarray)

In [12]:
def main():

    df = pd.read_csv('BloombergVOTELEVEL_Touse_mini.csv', nrows=42000)

    print_resource_usage()
    
    # This value tells which judge (1, 2 or 3) disagree the other
    df.Dissenting1 = df.Dissenting1.fillna(value=0)

    copylist = get_columns_after_feature(df, 'Term')
    copylist.append('judgeidentificationnumber')

    caseList = pd.unique(df['caseid'])
    caseList = caseList[pd.notnull(caseList)].tolist()

    Length = len(caseList)

In [13]:
def get_columns_after_feature(data_frame, feature):
    
    columns = list(df.columns)
    
    return df.columns.tolist()[columns.index(feature):]
    

In [14]:
def isNaN(num):
    return num != num

In [15]:
df = pd.read_csv('BloombergVOTELEVEL_Touse_mini.csv', low_memory=False, nrows=100)

In [16]:
drop_columns(df)

In [17]:
caseList = pd.unique(df['caseid'])
caseList = caseList[pd.notnull(caseList)].tolist()

In [18]:
df.Dissenting1 = df.Dissenting1.fillna(value=0)

In [19]:
copylist = get_columns_after_feature(df, 'Term')
copylist.append('judgeidentificationnumber')

In [23]:
# %%time

newframe=pd.DataFrame()

output=[]

for case in caseList:
    
    temper = np.where(df.caseid==case)
    temper = (temper[0]).tolist()

    to_append = []
    
    for i in range(len(temper)):

        for j in [1,2]:
            for term in copylist:
                name='%sANO'%(term)
                df.loc[temper[i], name] = df.ix[ temper[i - j], term ]
            
            to_append.append(df.ix[temper[i]])
            
    newframe = newframe.append(to_append)

    if df.loc[temper[0],'Dissenting1'] == 0:      ## if dissenting is 0, no one disagree
       output=output+[0,0,0,0,0,0]
    else :
       a = df.loc[temper[0],'Dissenting1']        ## otherwise, find the one with 'j' value equal to Dissent1
       for i in range(len(temper)):               ## 'j' value correspond to the judge in that row, 
           if a == df.loc[temper[i],'j']:         ## take values 1,2,3
               output = output + [1,1]
           elif a==df.loc[temper[i-1],'j']:
               output=output+[1,0]
           elif a==df.loc[temper[i-2],'j']:
               output=output+[0,1]
                
# newframe.to_csv('test.csv')
# np.savetxt('test.out.csv', output, delimiter=',') 

IndexError: list index out of range