# Cleaning.ipynb
#### This script cleans and merges relevant variables across datasets
#### Currently, it is doing cleaning and merging only for the baseline model.

In [2]:
import pandas as pd
import numpy as np
pd.set_option('precision', 5)

In [3]:
path = '/data/Dropbox/Data/Asylum_Courts/raw'

## Clean court_appln.csv

relevant variables: idnProceeding, idnCase, Appl_Code

In [77]:
app = pd.read_csv(path + '/court_appln.csv', low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [78]:
# adding placeholder number to count how many applications were filed for idnProceeding
app['numAppsPerProc'] = 1
app['numAppsPerProc'] = app['numAppsPerProc'].astype('int64')

# adding additional feature based on how many applications have been filed for the same (idnCase, idnProceeding) pair
app['numAppsPerProc'] = app.groupby(['idnCase', 'idnProceeding'])['numAppsPerProc'].transform('count')

# dropping all applications with empty decisions
app = app.dropna(subset=['Appl_Dec'])

#app.describe()

In [79]:
# making a new variable, dec, simplifying grant decisions to DENY, GRANT, or nan
app['dec']= np.nan
app.loc[((app["Appl_Dec"] == 'G') |( app["Appl_Dec"] == 'F' ) 
             | (app["Appl_Dec"] == 'N') | (app["Appl_Dec"] == 'L')
            | (app["Appl_Dec"] == 'C')),'dec'] = 'GRANT'
app.loc[(app["Appl_Dec"] == 'D'),'dec'] = 'DENY'
app = app[app.dec.isin(['DENY','GRANT'])] # only include DENY or GRANT cases

#app.count()

In [80]:
# only keep applications of type ASYL, ASYW, WCAT. sort in this order within idnproceeding
app = app[app.Appl_Code.isin(['ASYL','ASYW', 'WCAT'])]
app = app.sort_values(['idnProceeding','Appl_Code'])

# drop people who have multiple idnproceedings of the same case type (about 1000 of these) 
# because it is unclear how these should be handled
app = app.drop_duplicates(subset= ['idnProceeding','Appl_Code'])

In [81]:
app = app.rename(columns={"idnCase":"idncase", "idnProceeding":"idnproceeding"})

In [8]:
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
836171,837950,75.0,3328085.0,ASYW,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
836173,837952,85.0,3328111.0,ASYW,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
836186,837965,136.0,3327844.0,ASYW,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
836188,837967,139.0,3327852.0,ASYW,1996-05-14 00:00:00,G,2.0,GRANT


In [82]:
# make unique--take the first application for each proceeding, when sorted in order ASYL, ASYW, WCAT

# NOTE: what if someone has multiple ASYL apps, then how do we choose?
app2 = app.groupby('idnproceeding', as_index=False).first()
#print(app_unique_idnp)
#app_unique_idnp.count()

In [10]:
app2.count()

idnproceeding         614388
idnProceedingAppln    614388
idncase               614388
Appl_Code             614388
Appl_Recd_Date        614387
Appl_Dec              614388
numAppsPerProc        614388
dec                   614388
dtype: int64

In [11]:
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, tracid, nat

In [83]:
# load in data
master = pd.read_csv(path + '/master.csv', low_memory=False)

In [13]:
# change variables to categorical for descriptive stats
#master['idncase'] = master['idncase'].astype('category')
#master['tracid'] = master['tracid'].astype('category')

#master.describe() # summary stats

In [84]:
#drop empty cases and proceedings
master = master.dropna(subset= ['idncase','idnproceeding'])
#master.describe()

In [86]:
# stuff on osc_date (date charges filed or NTA)
master = master.dropna(subset=['osc_date']) # dropping empty dates

master['osc_date'] = master['osc_date'].astype('str')
master = master[master['osc_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y') # change to date format 
#master.describe()

In [87]:
#comp date (date proceeding completed)
master = master.dropna(subset=['comp_date']) # dropping empty dates

master['comp_date'] = master['comp_date'].astype('str')
master = master[master['comp_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y') # change to date format 

In [88]:
master.head(10)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
11,2046920.0,MX,RMV,,3200048,CHI,CHD,O,X,,2004-08-06,11AUG2004,2004-08-11,,RDV,31.0
12,2046921.0,MX,RMV,,3200049,CHI,CHD,O,X,,2004-08-06,10AUG2004,2004-08-11,,RDV,31.0
13,2046922.0,MX,RMV,,3200050,CHI,CHD,O,X,,2004-08-09,19AUG2004,2004-08-19,,JLG,29.0
14,2046923.0,PL,RMV,,3200051,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-25,1.0,CC,27.0
15,2046923.0,PL,RMV,,3525150,CHI,CHD,,,T,2004-08-09,30MAR2005,2005-04-13,1.0,GPK,30.0
16,2046923.0,PL,RMV,,3538044,CHI,CHI,O,R,,2004-08-09,13APR2005,2007-06-04,1.0,CC,27.0
17,2046924.0,MX,RMV,,3200052,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-13,,RDV,31.0
18,2046925.0,MX,RMV,,3200053,CHI,CHD,O,X,,2004-08-10,19AUG2004,2004-08-19,,JLG,29.0
19,2046926.0,MX,RMV,,3200054,CHI,CHD,O,X,,2004-08-10,16AUG2004,2004-08-30,,CMZ,32.0
20,2046927.0,MX,RMV,,3200055,CHI,CHD,O,X,,2004-08-12,19AUG2004,2004-08-19,,JLG,29.0


In [89]:
# delete duplicates (since idnproceeding are unique, this shouldn't do anything)
master = master.drop_duplicates(subset=['idncase', 'idnproceeding'])

In [90]:
# define master2, which only has variables of interest
master2 = master[['idncase','idnproceeding', 'osc_date', 'comp_date','tracid', 'nat']]

In [91]:
master2.head(10)

Unnamed: 0,idncase,idnproceeding,osc_date,comp_date,tracid,nat
11,2046920.0,3200048,2004-08-06,2004-08-11,31.0,MX
12,2046921.0,3200049,2004-08-06,2004-08-11,31.0,MX
13,2046922.0,3200050,2004-08-09,2004-08-19,29.0,MX
14,2046923.0,3200051,2004-08-09,2004-08-25,27.0,PL
15,2046923.0,3525150,2004-08-09,2005-04-13,30.0,PL
16,2046923.0,3538044,2004-08-09,2007-06-04,27.0,PL
17,2046924.0,3200052,2004-08-09,2004-08-13,31.0,MX
18,2046925.0,3200053,2004-08-10,2004-08-19,29.0,MX
19,2046926.0,3200054,2004-08-10,2004-08-30,32.0,MX
20,2046927.0,3200055,2004-08-12,2004-08-19,29.0,MX


In [92]:
master2.count()

idncase          5716359
idnproceeding    5716359
osc_date         5716359
comp_date        5716359
tracid           5393477
nat              5714180
dtype: int64

In [93]:
master2['idnproceeding'] = master2['idnproceeding'].astype('float64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Merge datasets

In [35]:
#to do:
#clean 3 columns more?
#check judge numbers
#multiple proceedings per case?
#one hot encoding
#set aside test data
#run model

In [94]:
merged = pd.merge(app2, master2, on='idnproceeding')

In [95]:
merged.count()

idnproceeding         613768
idnProceedingAppln    613768
idncase_x             613768
Appl_Code             613768
Appl_Recd_Date        613767
Appl_Dec              613768
numAppsPerProc        613768
dec                   613768
idncase_y             613768
osc_date              613768
comp_date             613768
tracid                597701
nat                   613576
dtype: int64

In [96]:
merged.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase_x,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,idncase_y,osc_date,comp_date,tracid,nat
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY,3328085.0,1994-11-03,1995-03-10,,HO
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY,3328111.0,1994-11-04,1997-06-16,71.0,HO
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT,3328153.0,1994-11-05,1995-08-08,139.0,GT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY,3328175.0,1994-11-05,1995-08-15,70.0,ES
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY,3327844.0,1995-02-06,1995-04-06,50.0,HO
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT,3327852.0,1995-02-06,1996-05-14,126.0,CU
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,3327869.0,1995-02-10,1996-01-04,71.0,NU
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,3327877.0,1995-02-10,1996-01-04,71.0,NU
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY,3327884.0,1995-02-12,1995-06-23,61.0,NU
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY,3327927.0,1995-02-17,1995-09-05,70.0,NU


## Save data

In [50]:
merged.to_csv('merged_master_app.csv')

In [97]:
#load data 
#merged = pd.read_csv('merged_master_app.csv')
#drop nan tracids and nat
merged = merged.dropna(subset=['tracid','nat'])
merged.count()


idnproceeding         597524
idnProceedingAppln    597524
idncase_x             597524
Appl_Code             597524
Appl_Recd_Date        597523
Appl_Dec              597524
numAppsPerProc        597524
dec                   597524
idncase_y             597524
osc_date              597524
comp_date             597524
tracid                597524
nat                   597524
dtype: int64

In [98]:
 merged.groupby('tracid').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase_x,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,idncase_y,osc_date,comp_date,nat
tracid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,4905,4905,4905,4905,4905,4905,4905,4905,4905,4905,4905,4905
2.0,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247
3.0,4195,4195,4195,4195,4195,4195,4195,4195,4195,4195,4195,4195
4.0,5732,5732,5732,5732,5732,5732,5732,5732,5732,5732,5732,5732
5.0,878,878,878,878,878,878,878,878,878,878,878,878
6.0,624,624,624,624,624,624,624,624,624,624,624,624
7.0,5197,5197,5197,5197,5197,5197,5197,5197,5197,5197,5197,5197
8.0,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503
9.0,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312
10.0,5751,5751,5751,5751,5751,5751,5751,5751,5751,5751,5751,5751


In [99]:
#drop all cases where judge has fewer than 100 cases--same as in gambler's fallacy paper
tracid_100 = merged.groupby('tracid').idnproceeding.count()>=100 #bool indicating whether judge has at least 100 cases
tracid_100 = tracid_100.index.values[tracid_100]#indices of judges with at least 100 cases
merged2 = merged.loc[merged.tracid.isin(tracid_100)]
merged2.count()

idnproceeding         596581
idnProceedingAppln    596581
idncase_x             596581
Appl_Code             596581
Appl_Recd_Date        596580
Appl_Dec              596581
numAppsPerProc        596581
dec                   596581
idncase_y             596581
osc_date              596581
comp_date             596581
tracid                596581
nat                   596581
dtype: int64

In [104]:
#merged2['osc_date'] = pd.to_datetime(merged2['osc_date']) # change to date format 

#there are 6377 osc dates before 1985, but I thought 1985 was supposed to be earliest year. 
merged2[merged2.osc_date.dt.year<1985]
#when osc year is pre 1985, it is often many years before the appl_recd_date. some may be error, but some may 
#actually have occurred well before the court date?
#merged2 = merged2[merged2.osc_year>1984]

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase_x,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,idncase_y,osc_date,comp_date,tracid,nat,osc_year
12876,91492.0,27740,2048164.0,ASYL,1985-03-04 00:00:00,D,5.0,DENY,2048164.0,1984-03-13,1987-10-29,10.0,??,1984
12879,91808.0,2477961,2048117.0,WCAT,2003-08-14 11:14:00,D,3.0,DENY,2048117.0,1974-05-31,2005-12-23,108.0,GE,1974
12909,93318.0,28337,2048737.0,ASYL,1983-06-09 00:00:00,D,3.0,DENY,2048737.0,1983-05-25,1986-08-26,305.0,NU,1983
12910,93370.0,28338,2049203.0,ASYL,1988-03-31 00:00:00,D,3.0,DENY,2049203.0,1984-03-21,1988-12-01,202.0,ES,1984
12919,94058.0,28353,2049102.0,ASYL,1988-08-01 00:00:00,D,2.0,DENY,2049102.0,1984-03-14,1988-09-28,166.0,CU,1984
12921,94207.0,28358,2049493.0,ASYL,1985-07-03 00:00:00,D,2.0,DENY,2049493.0,1974-03-24,1987-04-06,1.0,CU,1974
12923,94262.0,28361,2048654.0,ASYL,1985-03-27 00:00:00,D,2.0,DENY,2048654.0,1976-07-20,1985-09-25,1.0,CU,1976
12945,95949.0,27422,2049605.0,ASYL,1981-01-06 00:00:00,G,4.0,GRANT,2049605.0,1973-01-16,1987-04-03,39.0,NU,1973
13006,97177.0,27588,2050079.0,ASYL,1996-06-19 00:00:00,D,5.0,DENY,2050079.0,1971-04-14,1996-10-08,250.0,CU,1971
13049,98428.0,27648,2051278.0,ASYL,1987-03-30 00:00:00,G,2.0,GRANT,2051278.0,1960-10-25,1987-08-03,153.0,SY,1960


In [116]:
#look at comp date--626 dates before 1985. drop these.
len(merged2[merged2.comp_date.dt.year<1985])
merged2 = merged2[merged2.comp_date.dt.year>1984]
#save
merged2.to_csv('merged2_master_app.csv')

In [114]:
merged2['comp_year'] = merged2.comp_date.dt.year
judge_ranges = merged2.groupby('tracid')['comp_year'].apply(np.ptp)
print(len(judge_ranges))
print(len(judge_ranges[judge_ranges>10])) #323 out of 371 judges have at least 10 years of cases



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


371
323


In [7]:
merged2 = pd.read_csv('/home/emilyboeke/merged2_master_app.csv')


In [20]:
counts = merged2.groupby(['tracid','comp_year'])['idnproceeding'].count()
counts_50 = counts[counts>49]
counts_50.index.values[2][:]


#plan: bin by 2 years. how many jduges have 10 consecutive years with at least 100 cases every 2 years?

(1.0, 1987)