# Cleaning.ipynb
#### This script cleans and merges relevant variables across datasets
#### Currently, it is doing cleaning and merging only for the baseline model.

In [2]:
import pandas as pd
import numpy as np
pd.set_option('precision', 5)

In [3]:
path = '/data/Dropbox/Data/Asylum_Courts/raw'

## Clean court_appln.csv

relevant variables: idnProceeding, idnCase, Appl_Code

In [3]:
app = pd.read_csv(path + '/court_appln.csv', low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [4]:
# adding placeholder number to count how many applications were filed for idnProceeding
app['numAppsPerProc'] = 1
app['numAppsPerProc'] = app['numAppsPerProc'].astype('int64')

# adding additional feature based on how many applications have been filed for the same (idnCase, idnProceeding) pair
app['numAppsPerProc'] = app.groupby(['idnCase', 'idnProceeding'])['numAppsPerProc'].transform('count')

# dropping all applications with empty decisions
app = app.dropna(subset=['Appl_Dec'])

#app.describe()

In [7]:
# making a new variable, dec, simplifying grant decisions to DENY, GRANT, or nan
app['dec']= np.nan
app.loc[((app["Appl_Dec"] == 'G') |( app["Appl_Dec"] == 'F' ) 
             | (app["Appl_Dec"] == 'N') | (app["Appl_Dec"] == 'L')
            | (app["Appl_Dec"] == 'C')),'dec'] = 'GRANT'
app.loc[(app["Appl_Dec"] == 'D'),'dec'] = 'DENY'
app = app[app.dec.isin(['DENY','GRANT'])] # only include DENY or GRANT cases

#app.count()


In [27]:
# only keep applications of type ASYL, ASYW, WCAT. sort by Grant, then deny, then case type in order (ASYL, ASYW, WCAT)
#, then date within idnproceeding
#sorting by date--if there are multiple applications with the same decision with the same case type, 
#take the most recent one.
app = app[app.Appl_Code.isin(['ASYL','ASYW', 'WCAT'])]

#sort multiple times because some need to be ascending and some descending
app = app.sort_values(['idnProceeding','dec','Appl_Code','Appl_Recd_Date'],ascending=[True,False,True,False])


In [28]:
app = app.rename(columns={"idnCase":"idncase", "idnProceeding":"idnproceeding"})

In [29]:
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
836171,837950,75.0,3328085.0,ASYW,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
836173,837952,85.0,3328111.0,ASYW,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
836186,837965,136.0,3327844.0,ASYW,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
836188,837967,139.0,3327852.0,ASYW,1996-05-14 00:00:00,G,2.0,GRANT


In [30]:
# make unique--take the first application for each proceeding, when sorted in order dec (grant deny),
#case type(ASYL, ASYW, WCAT), date
app2 = app.groupby('idnproceeding', as_index=False).first()
#print(app_unique_idnp)
#app_unique_idnp.count()

In [32]:
app2.count()

idnproceeding         614388
idnProceedingAppln    614388
idncase               614388
Appl_Code             614388
Appl_Recd_Date        614387
Appl_Dec              614388
numAppsPerProc        614388
dec                   614388
dtype: int64

In [33]:
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, tracid, nat

In [34]:
# load in data
master = pd.read_csv(path + '/master.csv', low_memory=False)

In [13]:
# change variables to categorical for descriptive stats
#master['idncase'] = master['idncase'].astype('category')
#master['tracid'] = master['tracid'].astype('category')

#master.describe() # summary stats

In [35]:
#drop empty cases and proceedings
master = master.dropna(subset= ['idncase','idnproceeding'])
#master.describe()

In [36]:
# stuff on osc_date (date charges filed or NTA)
master = master.dropna(subset=['osc_date']) # dropping empty dates

master['osc_date'] = master['osc_date'].astype('str')
master = master[master['osc_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y') # change to date format 
#master.describe()

In [37]:
#comp date (date proceeding completed)
master = master.dropna(subset=['comp_date']) # dropping empty dates

master['comp_date'] = master['comp_date'].astype('str')
master = master[master['comp_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y') # change to date format 

In [38]:
master.head(10)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
11,2046920.0,MX,RMV,,3200048,CHI,CHD,O,X,,2004-08-06,11AUG2004,2004-08-11,,RDV,31.0
12,2046921.0,MX,RMV,,3200049,CHI,CHD,O,X,,2004-08-06,10AUG2004,2004-08-11,,RDV,31.0
13,2046922.0,MX,RMV,,3200050,CHI,CHD,O,X,,2004-08-09,19AUG2004,2004-08-19,,JLG,29.0
14,2046923.0,PL,RMV,,3200051,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-25,1.0,CC,27.0
15,2046923.0,PL,RMV,,3525150,CHI,CHD,,,T,2004-08-09,30MAR2005,2005-04-13,1.0,GPK,30.0
16,2046923.0,PL,RMV,,3538044,CHI,CHI,O,R,,2004-08-09,13APR2005,2007-06-04,1.0,CC,27.0
17,2046924.0,MX,RMV,,3200052,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-13,,RDV,31.0
18,2046925.0,MX,RMV,,3200053,CHI,CHD,O,X,,2004-08-10,19AUG2004,2004-08-19,,JLG,29.0
19,2046926.0,MX,RMV,,3200054,CHI,CHD,O,X,,2004-08-10,16AUG2004,2004-08-30,,CMZ,32.0
20,2046927.0,MX,RMV,,3200055,CHI,CHD,O,X,,2004-08-12,19AUG2004,2004-08-19,,JLG,29.0


In [39]:
# delete duplicates (since idnproceeding are unique, this shouldn't do anything)
master = master.drop_duplicates(subset=['idncase', 'idnproceeding'])

In [40]:
# define master2, which only has variables of interest
master2 = master[['idncase','idnproceeding', 'osc_date', 'comp_date','tracid', 'nat']]

In [41]:
master2.head(10)

Unnamed: 0,idncase,idnproceeding,osc_date,comp_date,tracid,nat
11,2046920.0,3200048,2004-08-06,2004-08-11,31.0,MX
12,2046921.0,3200049,2004-08-06,2004-08-11,31.0,MX
13,2046922.0,3200050,2004-08-09,2004-08-19,29.0,MX
14,2046923.0,3200051,2004-08-09,2004-08-25,27.0,PL
15,2046923.0,3525150,2004-08-09,2005-04-13,30.0,PL
16,2046923.0,3538044,2004-08-09,2007-06-04,27.0,PL
17,2046924.0,3200052,2004-08-09,2004-08-13,31.0,MX
18,2046925.0,3200053,2004-08-10,2004-08-19,29.0,MX
19,2046926.0,3200054,2004-08-10,2004-08-30,32.0,MX
20,2046927.0,3200055,2004-08-12,2004-08-19,29.0,MX


In [42]:
master2.count()

idncase          5716359
idnproceeding    5716359
osc_date         5716359
comp_date        5716359
tracid           5393477
nat              5714180
dtype: int64

In [43]:
master2['idnproceeding'] = master2['idnproceeding'].astype('float64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Merge datasets

In [44]:
merged = pd.merge(app2, master2, on=['idnproceeding','idncase'])

In [45]:
merged.count()

idnproceeding         613766
idnProceedingAppln    613766
idncase               613766
Appl_Code             613766
Appl_Recd_Date        613765
Appl_Dec              613766
numAppsPerProc        613766
dec                   613766
osc_date              613766
comp_date             613766
tracid                597699
nat                   613574
dtype: int64

In [46]:
merged.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,tracid,nat
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY,1994-11-03,1995-03-10,,HO
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY,1994-11-04,1997-06-16,71.0,HO
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT,1994-11-05,1995-08-08,139.0,GT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY,1994-11-05,1995-08-15,70.0,ES
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY,1995-02-06,1995-04-06,50.0,HO
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT,1995-02-06,1996-05-14,126.0,CU
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY,1995-02-12,1995-06-23,61.0,NU
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY,1995-02-17,1995-09-05,70.0,NU


## Save data

In [50]:
merged.to_csv('merged_master_app.csv')

In [47]:
#load data 
#merged = pd.read_csv('merged_master_app.csv')
#drop nan tracids and nat
merged = merged.dropna(subset=['tracid','nat'])
merged.count()


idnproceeding         597522
idnProceedingAppln    597522
idncase               597522
Appl_Code             597522
Appl_Recd_Date        597521
Appl_Dec              597522
numAppsPerProc        597522
dec                   597522
osc_date              597522
comp_date             597522
tracid                597522
nat                   597522
dtype: int64

In [48]:
 merged.groupby('tracid').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,nat
tracid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,4905,4905,4905,4905,4905,4905,4905,4905,4905,4905,4905
2.0,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247
3.0,4195,4195,4195,4195,4195,4195,4195,4195,4195,4195,4195
4.0,5732,5732,5732,5732,5732,5732,5732,5732,5732,5732,5732
5.0,878,878,878,878,878,878,878,878,878,878,878
6.0,624,624,624,624,624,624,624,624,624,624,624
7.0,5197,5197,5197,5197,5197,5197,5197,5197,5197,5197,5197
8.0,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503
9.0,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312
10.0,5751,5751,5751,5751,5751,5751,5751,5751,5751,5751,5751


In [49]:
#drop all cases where judge has fewer than 100 cases--same as in gambler's fallacy paper
tracid_100 = merged.groupby('tracid').idnproceeding.count()>=100 #bool indicating whether judge has at least 100 cases
tracid_100 = tracid_100.index.values[tracid_100]#indices of judges with at least 100 cases
merged2 = merged.loc[merged.tracid.isin(tracid_100)]
merged2.count()

idnproceeding         596579
idnProceedingAppln    596579
idncase               596579
Appl_Code             596579
Appl_Recd_Date        596578
Appl_Dec              596579
numAppsPerProc        596579
dec                   596579
osc_date              596579
comp_date             596579
tracid                596579
nat                   596579
dtype: int64

In [50]:
#merged2['osc_date'] = pd.to_datetime(merged2['osc_date']) # change to date format 

#there are 6377 osc dates before 1985, but I thought 1985 was supposed to be earliest year. 
merged2[merged2.osc_date.dt.year<1985]
#when osc year is pre 1985, it is often many years before the appl_recd_date. some may be error, but some may 
#actually have occurred well before the court date?


Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,tracid,nat
12876,91492.0,27740,2048164.0,ASYL,1985-03-04 00:00:00,D,5.0,DENY,1984-03-13,1987-10-29,10.0,??
12879,91808.0,2477961,2048117.0,WCAT,2003-08-14 11:14:00,D,3.0,DENY,1974-05-31,2005-12-23,108.0,GE
12909,93318.0,28337,2048737.0,ASYL,1983-06-09 00:00:00,D,3.0,DENY,1983-05-25,1986-08-26,305.0,NU
12910,93370.0,28338,2049203.0,ASYL,1988-03-31 00:00:00,D,3.0,DENY,1984-03-21,1988-12-01,202.0,ES
12919,94058.0,28353,2049102.0,ASYL,1988-08-01 00:00:00,D,2.0,DENY,1984-03-14,1988-09-28,166.0,CU
12921,94207.0,28358,2049493.0,ASYL,1985-07-03 00:00:00,D,2.0,DENY,1974-03-24,1987-04-06,1.0,CU
12923,94262.0,28361,2048654.0,ASYL,1985-03-27 00:00:00,D,2.0,DENY,1976-07-20,1985-09-25,1.0,CU
12945,95949.0,27422,2049605.0,ASYL,1981-01-06 00:00:00,G,4.0,GRANT,1973-01-16,1987-04-03,39.0,NU
13006,97177.0,27588,2050079.0,ASYL,1996-06-19 00:00:00,D,5.0,DENY,1971-04-14,1996-10-08,250.0,CU
13049,98428.0,27648,2051278.0,ASYL,1987-03-30 00:00:00,G,2.0,GRANT,1960-10-25,1987-08-03,153.0,SY


In [51]:
#look at comp date--626 dates before 1985. drop these.
len(merged2[merged2.comp_date.dt.year<1985])
merged2 = merged2[merged2.comp_date.dt.year>1984]
merged2.count()

idnproceeding         595953
idnProceedingAppln    595953
idncase               595953
Appl_Code             595953
Appl_Recd_Date        595952
Appl_Dec              595953
numAppsPerProc        595953
dec                   595953
osc_date              595953
comp_date             595953
tracid                595953
nat                   595953
dtype: int64

In [53]:
merged2.groupby('dec').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DENY,364803,364803,364803,364803,364803,364803,364803,364803,364803,364803,364803
GRANT,231150,231150,231150,231150,231149,231150,231150,231150,231150,231150,231150


In [60]:
# adding additional feature based on how many proceedings have been filed for the same (idnCase) 
merged2['numProcPerCase'] = 1
merged2['numProcPerCase'] = merged2['numProcPerCase'].astype('int64')
merged2['numProcPerCase'] = merged2.groupby(['idncase'])['numProcPerCase'].transform('count')

#make unique at idncase level, sorting with the same logic as used to sort applications
#counting case as a grant if ANY proceeding was grant
merged_case = merged2.sort_values(['idncase','dec','Appl_Code','Appl_Recd_Date'],ascending=[True,False,True,False])
merged_case = merged_case.groupby('idncase',as_index=False ).first()

In [66]:
merged_case.groupby('dec').count()

Unnamed: 0_level_0,idncase,idnproceeding,idnProceedingAppln,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat,numProcPerCase
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DENY,341232,341232,341232,341232,341232,341232,341232,341232,341232,341232,341232,341232
GRANT,226717,226717,226717,226717,226716,226717,226717,226717,226717,226717,226717,226717


In [6]:
unique_nat = merged_case.nat.unique()
unique_nat

array(['CH', 'CM', 'AL', 'ET', 'CF', 'LE', 'GT', 'HO', 'IN', 'UE', 'CU',
       'LV', 'ES', 'UZ', 'LH', 'MX', 'AU', 'GE', 'PO', '??', 'JM', 'RU',
       'CO', 'UK', 'RO', 'NU', 'HU', 'IR', 'TS', 'ID', 'SY', 'YO', 'EC',
       'PK', 'CI', 'CZ', 'IZ', 'DR', 'JA', 'PE', 'BU', 'VE', 'MO', 'LI',
       'HK', 'TW', 'HA', 'CS', 'NI', 'AR', 'RP', 'PL', 'YS', 'KE', 'BH',
       'JO', 'AG', 'ST', 'PM', 'EG', 'GR', 'BN', 'BG', 'TU', 'UR', 'AZ',
       'SP', 'FJ', 'BY', 'IV', 'BI', 'CE', 'GH', 'KS', 'CX', 'SL', 'BM',
       'GY', 'BA', 'MV', 'GO', 'GV', 'CY', 'IS', 'AF', 'NN', 'BL', 'EI',
       'CG', 'BW', 'TD', 'CA', 'CQ', 'SO', 'VM', 'KU', 'MM', 'DM', 'IT',
       'SF', 'TK', 'UG', 'AM', 'YE', 'UY', 'BX', 'GA', 'LY', 'SK', 'BF',
       'BR', 'PA', 'FG', 'CB', 'SS', 'PS', 'CC', 'LA', 'BB', 'ZA', 'SZ',
       'IY', 'AO', 'PU', 'TH', 'SW', 'NG', 'ZI', 'KV', 'SN', 'WS', 'MT',
       'MZ', 'SU', 'GJ', 'KN', 'FA', 'FR', 'MY', 'TZ', 'SM', 'ER', 'TO',
       'SA', 'MA', 'CV', 'MI', 'BC', 'RW', 'NS', 'G

In [11]:
#drop 159 cases with unknown nationalities
merged_case = merged_case.loc[~(merged_case.nat=='??')]

In [30]:
#load nationality lookup table
nat_lut =  pd.read_csv(path+ '/tblLookupNationality.csv',header=None)

#drop 4 observations where the nationality code is not in the lookup table
merged_case = merged_case.loc[merged_case.nat.isin(nat_lut[1])]

#drop 2 observations with nationality code XX whic the LUT says corresponds 
#to "BE REMOVED FROM THE UNITED STATES"
merged_case = merged_case.loc[~(merged_case.nat=="XX")]
merged_case.count()

idncase               567784
idnproceeding         567784
idnProceedingAppln    567784
Appl_Code             567784
Appl_Recd_Date        567783
Appl_Dec              567784
numAppsPerProc        567784
dec                   567784
osc_date              567784
comp_date             567784
tracid                567784
nat                   567784
numProcPerCase        567784
dtype: int64

In [31]:
merged_case.to_csv('merged_any_master_app.csv',index=False)