# Cleaning_any_asylum.ipynb
#### This script cleans and merges relevant variables across datasets.
#### It considers a grant on any asylum case type (full, withholding, wcat) a "grant" decision on the case.
#### Currently, it is doing cleaning and merging only for the baseline model.

In [1]:
import pandas as pd
import numpy as np
# Display floats with 5 places after the decimal point.
# The fully-qualified option key is used because the bare 'precision'
# pattern is ambiguous in recent pandas releases (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 5)

In [2]:
# Directory containing the raw CSV inputs; file names are appended to it
# with string concatenation when each dataset is loaded.
path = '/data/Dropbox/Data/Asylum_Courts/raw'

## Clean court_appln.csv

Relevant variables: idnProceeding, idnCase, Appl_Code, Appl_Dec, Appl_Recd_Date

In [3]:
# Read the raw applications table. low_memory=False turns off chunked
# parsing so each column's dtype is inferred from the whole file.
appln_csv = path + '/court_appln.csv'
app = pd.read_csv(appln_csv, low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [4]:
# Count how many application rows share each (idnCase, idnProceeding) pair.
# A constant int64 column of ones gives the group-wise count something to
# tally. The count is taken before rows with empty decisions are removed,
# so it includes those applications as well.
pair_keys = ['idnCase', 'idnProceeding']
app['numAppsPerProc'] = np.ones(len(app), dtype='int64')
apps_per_pair = app.groupby(pair_keys)['numAppsPerProc'].transform('count')
app['numAppsPerProc'] = apps_per_pair

# Keep only the applications that have a recorded decision.
has_decision = app['Appl_Dec'].notnull()
app = app[has_decision]

app.count()

idnProceedingAppln    4232861
idnProceeding         4232861
idnCase               4232857
Appl_Code             4232859
Appl_Recd_Date        4232826
Appl_Dec              4232861
numAppsPerProc        4232857
dtype: int64

In [5]:
# Collapse the raw decision codes in Appl_Dec into a two-valued outcome, dec.
# Codes G, F, N, L and C are treated as a grant and D as a denial; every
# other code maps to NaN and its row is dropped.
# The column is built in one step with Series.map so it is created with
# object dtype directly, instead of writing strings into a float64 column
# pre-filled with NaN (an incompatible-dtype assignment that recent pandas
# versions warn about).
decision_labels = {code: 'GRANT' for code in ['G', 'F', 'N', 'L', 'C']}
decision_labels['D'] = 'DENY'
app['dec'] = app['Appl_Dec'].map(decision_labels)
app = app[app['dec'].isin(['DENY', 'GRANT'])]  # only include DENY or GRANT cases

app.count()


idnProceedingAppln    2374977
idnProceeding         2374977
idnCase               2374977
Appl_Code             2374976
Appl_Recd_Date        2374951
Appl_Dec              2374977
numAppsPerProc        2374977
dec                   2374977
dtype: int64

In [6]:
# Restrict to the three asylum-related application types and order the rows
# so that, within each proceeding, the preferred application comes first:
#   1. dec descending            -> GRANT rows before DENY rows
#   2. Appl_Code ascending       -> ASYL, then ASYW, then WCAT
#   3. Appl_Recd_Date descending -> latest received date first
# A single sort_values call handles the mixed directions through the
# per-column `ascending` list.
asylum_codes = ['ASYL', 'ASYW', 'WCAT']
app = app.loc[app['Appl_Code'].isin(asylum_codes)]

sort_spec = [
    ('idnProceeding', True),
    ('dec', False),
    ('Appl_Code', True),
    ('Appl_Recd_Date', False),
]
app = app.sort_values(
    by=[column for column, _ in sort_spec],
    ascending=[is_ascending for _, is_ascending in sort_spec],
)


In [7]:
# Lower-case the two key columns so they match the key names used by the
# master table (idncase, idnproceeding).
lowercase_keys = {'idnCase': 'idncase', 'idnProceeding': 'idnproceeding'}
app = app.rename(columns=lowercase_keys)

In [8]:
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
836171,837950,75.0,3328085.0,ASYW,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
836173,837952,85.0,3328111.0,ASYW,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
836186,837965,136.0,3327844.0,ASYW,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
836188,837967,139.0,3327852.0,ASYW,1996-05-14 00:00:00,G,2.0,GRANT


In [9]:
# Make unique at the proceeding level: keep the first *row* for each
# idnproceeding under the current ordering (dec: grant before deny, then
# case type ASYL/ASYW/WCAT, then date).
# drop_duplicates is used instead of groupby(...).first() because
# GroupBy.first() returns the first non-null value of each column
# separately, which can stitch together fields from different applications
# when the top-ranked row has a missing value (e.g. Appl_Recd_Date).
# Rows with a missing idnproceeding are excluded, as groupby would do, and
# idnproceeding is moved to the front to keep the previous column layout.
# Row order follows app, which is already sorted by idnproceeding.
app2 = (
    app.dropna(subset=['idnproceeding'])
    .drop_duplicates(subset=['idnproceeding'], keep='first')
    .reset_index(drop=True)
)
app2 = app2[['idnproceeding'] + [col for col in app2.columns if col != 'idnproceeding']]


In [10]:
app2.count()

idnproceeding         614388
idnProceedingAppln    614388
idncase               614388
Appl_Code             614388
Appl_Recd_Date        614387
Appl_Dec              614388
numAppsPerProc        614388
dec                   614388
dtype: int64

In [11]:
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, comp_date, tracid, nat

In [12]:
# Read the raw master table. low_memory=False turns off chunked parsing so
# each column's dtype is inferred from the whole file.
master_csv = path + '/master.csv'
master = pd.read_csv(master_csv, low_memory=False)
#master.describe() # summary stats

In [13]:
# Discard rows that lack either identifier (idncase or idnproceeding).
id_cols = ['idncase', 'idnproceeding']
has_both_ids = master[id_cols].notnull().all(axis=1)
master = master[has_both_ids]
#master.describe()

In [14]:
# osc_date: date the charges were filed (or NTA).
# Drop rows with no osc_date at all.
master = master[master['osc_date'].notnull()]

# Keep only values that are exactly 9 characters long once rendered as text,
# the length of a DDMONYYYY string such as 06AUG2004; any other length is
# treated as an invalid format.
master['osc_date'] = master['osc_date'].astype('str')
is_nine_chars = master['osc_date'].str.len() == 9
master = master[is_nine_chars]

# Parse the surviving strings as day / abbreviated month name / 4-digit year.
master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y')

# Keep only NTA dates from 1984 onwards.
master = master[master['osc_date'].dt.year >= 1984]

In [15]:
# comp_date: date the proceeding was completed.
# Drop rows with no comp_date at all.
master = master[master['comp_date'].notnull()]

# Keep only values that are exactly 9 characters long once rendered as text,
# the length of a DDMONYYYY string such as 11AUG2004; any other length is
# treated as an invalid format.
master['comp_date'] = master['comp_date'].astype('str')
is_nine_chars = master['comp_date'].str.len() == 9
master = master[is_nine_chars]

# Parse the surviving strings as day / abbreviated month name / 4-digit year.
master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y')

# Keep only completion dates from 1985 onwards.
master = master[master['comp_date'].dt.year >= 1985]


In [16]:
master.head(10)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
11,2046920.0,MX,RMV,,3200048,CHI,CHD,O,X,,2004-08-06,11AUG2004,2004-08-11,,RDV,31.0
12,2046921.0,MX,RMV,,3200049,CHI,CHD,O,X,,2004-08-06,10AUG2004,2004-08-11,,RDV,31.0
13,2046922.0,MX,RMV,,3200050,CHI,CHD,O,X,,2004-08-09,19AUG2004,2004-08-19,,JLG,29.0
14,2046923.0,PL,RMV,,3200051,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-25,1.0,CC,27.0
15,2046923.0,PL,RMV,,3525150,CHI,CHD,,,T,2004-08-09,30MAR2005,2005-04-13,1.0,GPK,30.0
16,2046923.0,PL,RMV,,3538044,CHI,CHI,O,R,,2004-08-09,13APR2005,2007-06-04,1.0,CC,27.0
17,2046924.0,MX,RMV,,3200052,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-13,,RDV,31.0
18,2046925.0,MX,RMV,,3200053,CHI,CHD,O,X,,2004-08-10,19AUG2004,2004-08-19,,JLG,29.0
19,2046926.0,MX,RMV,,3200054,CHI,CHD,O,X,,2004-08-10,16AUG2004,2004-08-30,,CMZ,32.0
20,2046927.0,MX,RMV,,3200055,CHI,CHD,O,X,,2004-08-12,19AUG2004,2004-08-19,,JLG,29.0


In [17]:
# Remove repeated (idncase, idnproceeding) pairs, keeping the first occurrence.
# idnproceeding is expected to be unique already, so this should not drop anything.
pair_seen_before = master.duplicated(subset=['idncase', 'idnproceeding'], keep='first')
master = master[~pair_seen_before]

In [18]:
# master2: the subset of master columns that is carried into the merge.
keep_cols = ['idncase', 'idnproceeding', 'osc_date', 'comp_date', 'tracid', 'nat']
master2 = master[keep_cols]

In [19]:
master2.head(10)

Unnamed: 0,idncase,idnproceeding,osc_date,comp_date,tracid,nat
11,2046920.0,3200048,2004-08-06,2004-08-11,31.0,MX
12,2046921.0,3200049,2004-08-06,2004-08-11,31.0,MX
13,2046922.0,3200050,2004-08-09,2004-08-19,29.0,MX
14,2046923.0,3200051,2004-08-09,2004-08-25,27.0,PL
15,2046923.0,3525150,2004-08-09,2005-04-13,30.0,PL
16,2046923.0,3538044,2004-08-09,2007-06-04,27.0,PL
17,2046924.0,3200052,2004-08-09,2004-08-13,31.0,MX
18,2046925.0,3200053,2004-08-10,2004-08-19,29.0,MX
19,2046926.0,3200054,2004-08-10,2004-08-30,32.0,MX
20,2046927.0,3200055,2004-08-12,2004-08-19,29.0,MX


In [20]:
master2.count()

idncase          5669748
idnproceeding    5669748
osc_date         5669748
comp_date        5669748
tracid           5360370
nat              5667915
dtype: int64

In [21]:
# Cast idnproceeding to float64 (presumably so its dtype matches the float
# idnproceeding key in app2 when the two tables are merged).
# assign() returns a new DataFrame, which avoids the SettingWithCopyWarning
# raised when a column is written into master2, a column slice of master.
master2 = master2.assign(idnproceeding=master2['idnproceeding'].astype('float64'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Merge datasets

In [22]:
# Inner join of the per-proceeding applications with the master records:
# a row survives only if its (idnproceeding, idncase) pair is in both tables.
merge_keys = ['idnproceeding', 'idncase']
merged = app2.merge(master2, how='inner', on=merge_keys)

In [23]:
merged.count()

idnproceeding         608560
idnProceedingAppln    608560
idncase               608560
Appl_Code             608560
Appl_Recd_Date        608559
Appl_Dec              608560
numAppsPerProc        608560
dec                   608560
osc_date              608560
comp_date             608560
tracid                594055
nat                   608403
dtype: int64

In [24]:
merged.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,tracid,nat
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY,1994-11-03,1995-03-10,,HO
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY,1994-11-04,1997-06-16,71.0,HO
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT,1994-11-05,1995-08-08,139.0,GT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY,1994-11-05,1995-08-15,70.0,ES
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY,1995-02-06,1995-04-06,50.0,HO
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT,1995-02-06,1996-05-14,126.0,CU
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,1995-02-10,1996-01-04,71.0,NU
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY,1995-02-12,1995-06-23,61.0,NU
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY,1995-02-17,1995-09-05,70.0,NU


In [25]:
# Remove rows missing either the judge identifier (tracid) or the
# nationality code (nat).
has_judge_and_nat = merged[['tracid', 'nat']].notnull().all(axis=1)
merged = merged[has_judge_and_nat]
merged.count()


idnproceeding         593907
idnProceedingAppln    593907
idncase               593907
Appl_Code             593907
Appl_Recd_Date        593906
Appl_Dec              593907
numAppsPerProc        593907
dec                   593907
osc_date              593907
comp_date             593907
tracid                593907
nat                   593907
dtype: int64

In [26]:
 merged.groupby('tracid').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,nat
tracid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,4776,4776,4776,4776,4776,4776,4776,4776,4776,4776,4776
2.0,1244,1244,1244,1244,1244,1244,1244,1244,1244,1244,1244
3.0,4193,4193,4193,4193,4193,4193,4193,4193,4193,4193,4193
4.0,5731,5731,5731,5731,5731,5731,5731,5731,5731,5731,5731
5.0,878,878,878,878,878,878,878,878,878,878,878
6.0,624,624,624,624,624,624,624,624,624,624,624
7.0,5137,5137,5137,5137,5137,5137,5137,5137,5137,5137,5137
8.0,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503,3503
9.0,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312,2312
10.0,5595,5595,5595,5595,5595,5595,5595,5595,5595,5595,5595


In [27]:
# Keep only judges (tracid) with at least 100 rows in the merged table --
# the same threshold as in the gambler's fallacy paper.
cases_per_judge = merged.groupby('tracid')['idnproceeding'].count()
tracid_100 = cases_per_judge[cases_per_judge >= 100].index.values  # tracids meeting the threshold
merged2 = merged[merged['tracid'].isin(tracid_100)]
merged2.count()

idnproceeding         592964
idnProceedingAppln    592964
idncase               592964
Appl_Code             592964
Appl_Recd_Date        592963
Appl_Dec              592964
numAppsPerProc        592964
dec                   592964
osc_date              592964
comp_date             592964
tracid                592964
nat                   592964
dtype: int64

In [28]:
#there are 6377 osc dates before 1985, but I thought 1985 was supposed to be earliest year. 
merged2[merged2.osc_date.dt.year<1985]
#when osc year is pre 1985, it is often many years before the appl_recd_date. some may be error, but some may 
#actually have occurred well before the court date?


Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,osc_date,comp_date,tracid,nat
12876,91492.0,27740,2048164.0,ASYL,1985-03-04 00:00:00,D,5.0,DENY,1984-03-13,1987-10-29,10.0,??
12908,93370.0,28338,2049203.0,ASYL,1988-03-31 00:00:00,D,3.0,DENY,1984-03-21,1988-12-01,202.0,ES
12916,94058.0,28353,2049102.0,ASYL,1988-08-01 00:00:00,D,2.0,DENY,1984-03-14,1988-09-28,166.0,CU
13052,99651.0,27838,2051255.0,ASYL,1990-03-23 00:00:00,D,3.0,DENY,1984-07-17,1990-09-28,197.0,PK
13098,101817.0,27781,2052487.0,ASYL,1986-05-21 00:00:00,D,2.0,DENY,1984-01-09,1986-08-29,251.0,CU
13186,106971.0,27974,2055405.0,ASYL,1990-05-15 00:00:00,D,2.0,DENY,1984-05-14,1991-01-10,251.0,CU
13191,107019.0,27986,2055618.0,ASYL,1988-10-25 00:00:00,D,4.0,DENY,1984-04-10,1989-06-27,1.0,CU
13192,107020.0,27987,2055618.0,ASYL,1985-02-28 00:00:00,D,4.0,DENY,1984-04-10,1987-05-28,1.0,CU
13265,110572.0,28123,2057635.0,ASYL,1988-12-13 00:00:00,D,4.0,DENY,1984-09-04,1989-05-03,226.0,KE
13266,110573.0,28124,2057635.0,ASYL,1986-04-10 00:00:00,D,4.0,DENY,1984-09-04,1986-10-02,226.0,KE


In [29]:
# Restrict to proceedings completed in 1985 or later.
# NOTE(review): the original note reports 626 completion dates before 1985,
# but master.comp_date is already limited to years after 1984 upstream and
# the counts displayed before and after this cell are identical -- confirm
# whether this filter still removes anything.
completed_1985_on = merged2['comp_date'].dt.year >= 1985
n_before_1985 = int((~completed_1985_on).sum())  # rows this filter removes
merged2 = merged2[completed_1985_on]
merged2.count()

idnproceeding         592964
idnProceedingAppln    592964
idncase               592964
Appl_Code             592964
Appl_Recd_Date        592963
Appl_Dec              592964
numAppsPerProc        592964
dec                   592964
osc_date              592964
comp_date             592964
tracid                592964
nat                   592964
dtype: int64

In [30]:
merged2.groupby('dec').count()

Unnamed: 0_level_0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DENY,362450,362450,362450,362450,362450,362450,362450,362450,362450,362450,362450
GRANT,230514,230514,230514,230514,230513,230514,230514,230514,230514,230514,230514


In [31]:
# Add numProcPerCase: how many asylum proceedings remain for each idncase.
# assign() creates a fresh DataFrame first, so the column is not written into
# merged2 while it is still a filtered selection of an earlier frame (which
# would raise SettingWithCopyWarning).
merged2 = merged2.assign(numProcPerCase=1)
merged2['numProcPerCase'] = merged2.groupby('idncase')['numProcPerCase'].transform('count')

# Make unique at the idncase level, sorting with the same logic as used to
# sort applications, so a case counts as a grant if ANY proceeding was a grant.
merged_case = merged2.sort_values(
    ['idncase', 'dec', 'Appl_Code', 'Appl_Recd_Date'],
    ascending=[True, False, True, False],
)
# Keep the first *row* per idncase. drop_duplicates is used instead of
# groupby(...).first() because GroupBy.first() takes the first non-null value
# of each column separately and can therefore combine fields from different
# proceedings when the top-ranked row has a missing value.
# Rows with a missing idncase are excluded, as groupby would do, and idncase
# is moved to the front to keep the previous column layout.
merged_case = (
    merged_case.dropna(subset=['idncase'])
    .drop_duplicates(subset=['idncase'], keep='first')
    .reset_index(drop=True)
)
merged_case = merged_case[['idncase'] + [col for col in merged_case.columns if col != 'idncase']]

In [32]:
merged_case.groupby('dec').count()

Unnamed: 0_level_0,idncase,idnproceeding,idnProceedingAppln,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,osc_date,comp_date,tracid,nat,numProcPerCase
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DENY,338928,338928,338928,338928,338928,338928,338928,338928,338928,338928,338928,338928
GRANT,226084,226084,226084,226084,226083,226084,226084,226084,226084,226084,226084,226084


In [33]:
# Distinct nationality codes present in the case-level table.
unique_nat = merged_case['nat'].unique()
unique_nat

array(['CH', 'CM', 'AL', 'ET', 'CF', 'LE', 'GT', 'HO', 'IN', 'UE', 'CU',
       'LV', 'ES', 'UZ', 'LH', 'MX', 'AU', 'PO', '??', 'JM', 'RU', 'CO',
       'UK', 'RO', 'HU', 'IR', 'TS', 'ID', 'SY', 'YO', 'EC', 'PK', 'CZ',
       'NU', 'DR', 'JA', 'PE', 'BU', 'VE', 'GE', 'MO', 'LI', 'HK', 'TW',
       'HA', 'CS', 'NI', 'AR', 'RP', 'IZ', 'PL', 'YS', 'KE', 'BH', 'JO',
       'AG', 'ST', 'PM', 'EG', 'GR', 'BN', 'TU', 'AZ', 'FJ', 'BG', 'BY',
       'IV', 'BI', 'CI', 'CE', 'GH', 'KS', 'CX', 'SL', 'BM', 'GY', 'MV',
       'UR', 'GO', 'GV', 'CY', 'IS', 'AF', 'NN', 'BL', 'EI', 'CG', 'BW',
       'TD', 'CA', 'CQ', 'SO', 'VM', 'KU', 'MM', 'IT', 'SF', 'UG', 'AM',
       'YE', 'UY', 'BX', 'GA', 'LY', 'SK', 'BF', 'BR', 'PA', 'FG', 'CB',
       'SS', 'PS', 'CC', 'LA', 'BB', 'ZA', 'SZ', 'IY', 'AO', 'PU', 'TH',
       'SW', 'ZI', 'KV', 'SN', 'WS', 'MT', 'BA', 'MZ', 'SU', 'GJ', 'KN',
       'FR', 'MY', 'TZ', 'NG', 'SM', 'ER', 'TO', 'SA', 'SP', 'MA', 'CV',
       'MI', 'BC', 'RW', 'NS', 'GZ', 'SG', 'CT', 'N

In [34]:
# Remove cases whose nationality is recorded as the unknown code '??'
# (159 cases according to the original note).
merged_case = merged_case[merged_case['nat'] != '??']

In [35]:
# Load the nationality lookup table; the file is read without a header row,
# so its columns are addressed by position.
lut_csv = path + '/tblLookupNationality.csv'
nat_lut = pd.read_csv(lut_csv, header=None)

# Keep a case only if its nationality code
#   * appears in column 1 of the lookup table (original note: 4 observations
#     fail this), and
#   * is not XX, which the lookup table says corresponds to
#     "BE REMOVED FROM THE UNITED STATES" (original note: 2 observations).
code_in_lookup = merged_case['nat'].isin(nat_lut[1])
code_is_xx = merged_case['nat'] == 'XX'
merged_case = merged_case[code_in_lookup & ~code_is_xx]
merged_case.count()

idncase               564850
idnproceeding         564850
idnProceedingAppln    564850
Appl_Code             564850
Appl_Recd_Date        564849
Appl_Dec              564850
numAppsPerProc        564850
dec                   564850
osc_date              564850
comp_date             564850
tracid                564850
nat                   564850
numProcPerCase        564850
dtype: int64

In [36]:
# Number of cases per nationality, ordered from rarest to most common.
per_nat = merged_case.groupby('nat', as_index=False)['idncase'].count()
nat_numbers = per_nat.sort_values(by='idncase')

# Some nationalities have only 1 or 2 observations; keep only nationalities
# with at least 10 cases.
nat_10 = nat_numbers.loc[nat_numbers['idncase'] >= 10, 'nat']
merged_case = merged_case.loc[merged_case['nat'].isin(nat_10)]

In [37]:
# Write the cleaned case-level table to CSV without the index column.
output_csv = '/home/emilyboeke/merged_any_master_app.csv'
merged_case.to_csv(output_csv, index=False)