# Cleaning_full_asylum.ipynb
#### This script cleans and merges relevant variables across datasets
####  it only considers proceedings/cases associated with full asylum applications. creates a dataset unique at the idncase level  (within proceeding, it picks the asylum application and within case, it picks the proceeding associated with the asylum application, prioritizing by date --most recent first)
#### Currently, it is doing cleaning and merging only for the baseline model.

In [4]:
import pandas as pd
import numpy as np
pd.set_option('precision', 5)

In [6]:
path = '/data/Dropbox/Data/Asylum_Courts/raw'

## Clean court_appln.csv

relevant variables: idnProceeding, idnCase, Appl_Code

In [3]:
app = pd.read_csv(path + '/court_appln.csv', low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [4]:
# adding placeholder number to count how many applications were filed for idnProceeding
app['numAppsPerProc'] = 1
app['numAppsPerProc'] = app['numAppsPerProc'].astype('int64')

# adding additional feature based on how many applications have been filed for the same (idnCase, idnProceeding) pair
app['numAppsPerProc'] = app.groupby(['idnCase', 'idnProceeding'])['numAppsPerProc'].transform('count')

# dropping all applications with empty decisions
app = app.dropna(subset=['Appl_Dec'])

#app.describe()

In [5]:
# making a new variable, dec, simplifying grant decisions to DENY, GRANT, or nan
app['dec']= np.nan
app.loc[((app["Appl_Dec"] == 'G') |( app["Appl_Dec"] == 'F' ) 
             | (app["Appl_Dec"] == 'N') | (app["Appl_Dec"] == 'L')
            | (app["Appl_Dec"] == 'C')),'dec'] = 'GRANT'
app.loc[(app["Appl_Dec"] == 'D'),'dec'] = 'DENY'
app = app[app.dec.isin(['DENY','GRANT'])] # only include DENY or GRANT cases

#app.count()


In [19]:
# only keep applications of type ASYL. sort by date within idnproceeding
#sorting by date--if there are multiple applications with the same decision with the same case type, 
#take the most recent one.
app = app[app.Appl_Code.isin(['ASYL'])]

#sort multiple times because some need to be ascending and some descending
app = app.sort_values(['idnProceeding','Appl_Recd_Date'],ascending=[True,False])
len(app)

573944

In [14]:
app = app.rename(columns={"idnCase":"idncase", "idnProceeding":"idnproceeding"})

In [15]:
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
59,60,145.0,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
60,61,147.0,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
61,62,149.0,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
62,63,159.0,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


In [20]:
# make unique--take the first application for each proceeding, when sorted in order dec (grant deny),
#case type(ASYL, ASYW, WCAT), date
app2 = app.groupby('idnproceeding', as_index=False).first()
#print(app_unique_idnp)
#app_unique_idnp.count()

In [21]:
app2.count()

idnproceeding         573565
idnProceedingAppln    573565
idncase               573565
Appl_Code             573565
Appl_Recd_Date        573563
Appl_Dec              573565
numAppsPerProc        573565
dec                   573565
dtype: int64

In [22]:
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, tracid, nat

In [24]:
# load in data
master = pd.read_csv(path + '/master.csv', low_memory=False)

In [13]:
# change variables to categorical for descriptive stats
#master['idncase'] = master['idncase'].astype('category')
#master['tracid'] = master['tracid'].astype('category')

#master.describe() # summary stats

In [25]:
#drop empty cases and proceedings
master = master.dropna(subset= ['idncase','idnproceeding'])
#master.describe()

In [26]:
# stuff on osc_date (date charges filed or NTA)
master = master.dropna(subset=['osc_date']) # dropping empty dates

master['osc_date'] = master['osc_date'].astype('str')
master = master[master['osc_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y') # change to date format 
#master.describe()

In [27]:
#comp date (date proceeding completed)
master = master.dropna(subset=['comp_date']) # dropping empty dates

master['comp_date'] = master['comp_date'].astype('str')
master = master[master['comp_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y') # change to date format 

In [28]:
master.head(10)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
11,2046920.0,MX,RMV,,3200048,CHI,CHD,O,X,,2004-08-06,11AUG2004,2004-08-11,,RDV,31.0
12,2046921.0,MX,RMV,,3200049,CHI,CHD,O,X,,2004-08-06,10AUG2004,2004-08-11,,RDV,31.0
13,2046922.0,MX,RMV,,3200050,CHI,CHD,O,X,,2004-08-09,19AUG2004,2004-08-19,,JLG,29.0
14,2046923.0,PL,RMV,,3200051,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-25,1.0,CC,27.0
15,2046923.0,PL,RMV,,3525150,CHI,CHD,,,T,2004-08-09,30MAR2005,2005-04-13,1.0,GPK,30.0
16,2046923.0,PL,RMV,,3538044,CHI,CHI,O,R,,2004-08-09,13APR2005,2007-06-04,1.0,CC,27.0
17,2046924.0,MX,RMV,,3200052,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-13,,RDV,31.0
18,2046925.0,MX,RMV,,3200053,CHI,CHD,O,X,,2004-08-10,19AUG2004,2004-08-19,,JLG,29.0
19,2046926.0,MX,RMV,,3200054,CHI,CHD,O,X,,2004-08-10,16AUG2004,2004-08-30,,CMZ,32.0
20,2046927.0,MX,RMV,,3200055,CHI,CHD,O,X,,2004-08-12,19AUG2004,2004-08-19,,JLG,29.0


In [29]:
# delete duplicates (since idnproceeding are unique, this shouldn't do anything)
master = master.drop_duplicates(subset=['idncase', 'idnproceeding'])

In [30]:
# define master2, which only has variables of interest
master2 = master[['idncase','idnproceeding', 'osc_date', 'comp_date','tracid', 'nat']]

In [31]:
master2.head(10)

Unnamed: 0,idncase,idnproceeding,osc_date,comp_date,tracid,nat
11,2046920.0,3200048,2004-08-06,2004-08-11,31.0,MX
12,2046921.0,3200049,2004-08-06,2004-08-11,31.0,MX
13,2046922.0,3200050,2004-08-09,2004-08-19,29.0,MX
14,2046923.0,3200051,2004-08-09,2004-08-25,27.0,PL
15,2046923.0,3525150,2004-08-09,2005-04-13,30.0,PL
16,2046923.0,3538044,2004-08-09,2007-06-04,27.0,PL
17,2046924.0,3200052,2004-08-09,2004-08-13,31.0,MX
18,2046925.0,3200053,2004-08-10,2004-08-19,29.0,MX
19,2046926.0,3200054,2004-08-10,2004-08-30,32.0,MX
20,2046927.0,3200055,2004-08-12,2004-08-19,29.0,MX


In [32]:
master2.count()

idncase          5716359
idnproceeding    5716359
osc_date         5716359
comp_date        5716359
tracid           5393477
nat              5714180
dtype: int64

In [33]:
master2['idnproceeding'] = master2['idnproceeding'].astype('float64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Merge datasets

In [34]:
merged = pd.merge(app2, master2, on=['idnproceeding','idncase'])

In [35]:
merged.count()

idnproceeding         572966
idnProceedingAppln    572966
idncase               572966
Appl_Code             572966
Appl_Recd_Date        572964
Appl_Dec              572966
numAppsPerProc        572966
dec                   572966
osc_date              572966
comp_date             572966
tracid                557130
nat                   572783
dtype: int64

In [36]:
merged.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,tracid,nat
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY,1994-11-03,1995-03-10,,HO
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY,1994-11-04,1997-06-16,71.0,HO
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT,1994-11-05,1995-08-08,139.0,GT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY,1994-11-05,1995-08-15,70.0,ES
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY,1995-02-06,1995-04-06,50.0,HO
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT,1995-02-06,1996-05-14,126.0,CU
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY,1995-02-12,1995-06-23,61.0,NU
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY,1995-02-17,1995-09-05,70.0,NU


## Save data

In [50]:
merged.to_csv('merged_master_app.csv')

In [37]:
#load data 
#merged = pd.read_csv('merged_master_app.csv')
#drop nan tracids and nat
merged = merged.dropna(subset=['tracid','nat'])
merged.count()


idnproceeding         556962
idnProceedingAppln    556962
idncase               556962
Appl_Code             556962
Appl_Recd_Date        556960
Appl_Dec              556962
numAppsPerProc        556962
dec                   556962
osc_date              556962
comp_date             556962
tracid                556962
nat                   556962
dtype: int64

In [38]:
 merged.groupby('tracid').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,nat
tracid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,4701,4701,4701,4701,4701,4701,4701,4701,4701,4701,4701
2.0,1178,1178,1178,1178,1178,1178,1178,1178,1178,1178,1178
3.0,4018,4018,4018,4018,4018,4018,4018,4018,4018,4018,4018
4.0,5521,5521,5521,5521,5521,5521,5521,5521,5521,5521,5521
5.0,814,814,814,814,814,814,814,814,814,814,814
6.0,571,571,571,571,571,571,571,571,571,571,571
7.0,5079,5079,5079,5079,5079,5079,5079,5079,5079,5079,5079
8.0,3341,3341,3341,3341,3341,3341,3341,3341,3341,3341,3341
9.0,2182,2182,2182,2182,2182,2182,2182,2182,2182,2182,2182
10.0,5579,5579,5579,5579,5579,5579,5579,5579,5579,5579,5579


In [39]:
#drop all cases where judge has fewer than 100 cases--same as in gambler's fallacy paper
tracid_100 = merged.groupby('tracid').idnproceeding.count()>=100 #bool indicating whether judge has at least 100 cases
tracid_100 = tracid_100.index.values[tracid_100]#indices of judges with at least 100 cases
merged2 = merged.loc[merged.tracid.isin(tracid_100)]
merged2.count()

idnproceeding         555810
idnProceedingAppln    555810
idncase               555810
Appl_Code             555810
Appl_Recd_Date        555808
Appl_Dec              555810
numAppsPerProc        555810
dec                   555810
osc_date              555810
comp_date             555810
tracid                555810
nat                   555810
dtype: int64

In [40]:
#look at comp date--~600 dates before 1985. drop these.
len(merged2[merged2.comp_date.dt.year<1985])
merged2 = merged2[merged2.comp_date.dt.year>1984]
merged2.count()

idnproceeding         555185
idnProceedingAppln    555185
idncase               555185
Appl_Code             555185
Appl_Recd_Date        555183
Appl_Dec              555185
numAppsPerProc        555185
dec                   555185
osc_date              555185
comp_date             555185
tracid                555185
nat                   555185
dtype: int64

In [41]:
merged2.groupby('dec').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DENY,353837,353837,353837,353837,353837,353837,353837,353837,353837,353837,353837
GRANT,201348,201348,201348,201348,201346,201348,201348,201348,201348,201348,201348


In [43]:
# adding additional feature based on how many full asylum proceedings have been filed for the same (idnCase) 
merged2['numProcPerCase'] = 1
merged2['numProcPerCase'] = merged2['numProcPerCase'].astype('int64')
merged2['numProcPerCase'] = merged2.groupby(['idncase'])['numProcPerCase'].transform('count')

#make unique at idncase level, sorting with the same logic as used to sort applications
merged_case = merged2.sort_values(['idncase','Appl_Recd_Date'],ascending=[True,False])
merged_case = merged_case.groupby('idncase',as_index=False ).first()

In [44]:
merged_case.count()

idncase               531214
idnproceeding         531214
idnProceedingAppln    531214
Appl_Code             531214
Appl_Recd_Date        531212
Appl_Dec              531214
numAppsPerProc        531214
dec                   531214
osc_date              531214
comp_date             531214
tracid                531214
nat                   531214
numProcPerCase        531214
dtype: int64

In [45]:
merged_case.groupby('dec').count()

Unnamed: 0_level_0,idncase,idnproceeding,idnProceedingAppln,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat,numProcPerCase
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DENY,333775,333775,333775,333775,333775,333775,333775,333775,333775,333775,333775,333775
GRANT,197439,197439,197439,197439,197437,197439,197439,197439,197439,197439,197439,197439


In [13]:
#merged_case = pd.read_csv('/home/emilyboeke/merged_full_asylum_master_app.csv')
#clean nationality data
#drop 159 cases with unknown nationalities
merged_case = merged_case.loc[~(merged_case.nat=='??')]
merged_case.count()
#load nationality lookup table
nat_lut =  pd.read_csv(path+ '/tblLookupNationality.csv',header=None)
#drop 4 observations where the nationality code is not in the lookup table
merged_case = merged_case.loc[merged_case.nat.isin(nat_lut[1])]

#drop 2 observations with nationality code XX whic the LUT says corresponds 
#to "BE REMOVED FROM THE UNITED STATES"
merged_case = merged_case.loc[~(merged_case.nat=="XX")]
merged_case.count()

idncase               531050
idnproceeding         531050
idnProceedingAppln    531050
Appl_Code             531050
Appl_Recd_Date        531048
Appl_Dec              531050
numAppsPerProc        531050
dec                   531050
osc_date              531050
comp_date             531050
tracid                531050
nat                   531050
numProcPerCase        531050
dtype: int64

In [None]:
merged_case.to_csv('/home/emilyboeke/merged_full_asylum_master_app.csv',index=False)