# Cleaning_any_asylum.ipynb
#### This script cleans and merges relevant variables across datasets
#### It considers a grant on any asylum case type (full, withholding, wcat) a "grant" decision on the case.
#### Currently, it is doing cleaning and merging only for the baseline model.

In [1]:
import pandas as pd
import numpy as np
# Show 5 decimal places in printed frames. The fully qualified option name is
# used because the bare key 'precision' is ambiguous (or removed) in newer
# pandas releases and raises an OptionError there.
pd.set_option('display.precision', 5)

In [2]:
# Directory holding the raw input CSV files read by the cells below.
path = "/data/Dropbox/Data/Asylum_Courts/raw"

## Clean court_appln.csv

relevant variables: idnProceeding, idnCase, Appl_Code

In [3]:
# Load the applications table from the raw data directory.
app = pd.read_csv(f'{path}/court_appln.csv', low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [4]:
# numAppsPerProc: how many applications share the same (idnCase, idnProceeding)
# pair. A constant int64 column of ones is created first and then replaced by
# the per-pair count of that column.
pair_keys = ['idnCase', 'idnProceeding']
app['numAppsPerProc'] = np.ones(len(app), dtype='int64')
app['numAppsPerProc'] = app.groupby(pair_keys)['numAppsPerProc'].transform('count')

# Applications with no recorded decision are removed only after counting,
# so they still contribute to numAppsPerProc.
has_decision = app['Appl_Dec'].notna()
app = app[has_decision]

#app.describe()
app.count()

idnProceedingAppln    4232861
idnProceeding         4232861
idnCase               4232857
Appl_Code             4232859
Appl_Recd_Date        4232826
Appl_Dec              4232861
numAppsPerProc        4232857
dtype: int64

In [5]:
# Collapse the raw decision codes in Appl_Dec into a two-valued outcome, dec:
# codes G, F, N, L and C count as GRANT, code D counts as DENY.
# The column is built in one step with map(), so it holds strings (or NaN)
# from the start. Creating a float NaN column and then writing strings into it
# is flagged by newer pandas versions as an incompatible-dtype assignment.
grant_codes = ['G', 'F', 'N', 'L', 'C']
decision_map = {code: 'GRANT' for code in grant_codes}
decision_map['D'] = 'DENY'
app['dec'] = app['Appl_Dec'].map(decision_map)

# Any other decision code maps to NaN and is dropped here.
app = app[app['dec'].isin(['DENY', 'GRANT'])]

app.count()


idnProceedingAppln    2374977
idnProceeding         2374977
idnCase               2374977
Appl_Code             2374976
Appl_Recd_Date        2374951
Appl_Dec              2374977
numAppsPerProc        2374977
dec                   2374977
dtype: int64

In [6]:
# Restrict to the three application codes of interest.
asylum_codes = ['ASYL', 'ASYW', 'WCAT']
app = app[app['Appl_Code'].isin(asylum_codes)]

# A single multi-key sort orders rows inside each proceeding so that the
# preferred application comes first:
#   dec descending            -> GRANT sorts ahead of DENY
#   Appl_Code ascending       -> ASYL, then ASYW, then WCAT
#   Appl_Recd_Date descending -> most recent first among otherwise equal rows
sort_spec = [
    ('idnProceeding', True),
    ('dec', False),
    ('Appl_Code', True),
    ('Appl_Recd_Date', False),
]
app = app.sort_values(
    by=[column for column, _ in sort_spec],
    ascending=[is_ascending for _, is_ascending in sort_spec],
)


In [7]:
# Lower-case the two key columns so they use the same names as the keys of
# the master table (idncase, idnproceeding).
key_renames = {"idnCase": "idncase", "idnProceeding": "idnproceeding"}
app = app.rename(columns=key_renames)

In [8]:
# Preview the first 10 rows of the filtered, sorted applications.
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
836171,837950,75.0,3328085.0,ASYW,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
836173,837952,85.0,3328111.0,ASYW,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
836186,837965,136.0,3327844.0,ASYW,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
836188,837967,139.0,3327852.0,ASYW,1996-05-14 00:00:00,G,2.0,GRANT


In [9]:
# Make the data unique per proceeding: keep the first application of each
# idnproceeding under the current row order (dec: GRANT before DENY, then
# case type ASYL/ASYW/WCAT, then most recent date).
#
# drop_duplicates keeps the whole first row. GroupBy.first() instead returns
# the first NON-NULL value of each column separately, so a missing field in
# the top-ranked row could be filled in from a different application.
app2 = app.dropna(subset=['idnproceeding'])  # groupby also skipped missing keys
app2 = app2.drop_duplicates(subset='idnproceeding', keep='first')

# Key column first and a fresh 0..n-1 index, matching the layout that
# groupby(..., as_index=False) produced.
app2 = app2[['idnproceeding'] + [c for c in app2.columns if c != 'idnproceeding']]
app2 = app2.reset_index(drop=True)


In [10]:
# Non-null counts per column after reducing to one row per proceeding.
app2.count()

idnproceeding         614388
idnProceedingAppln    614388
idncase               614388
Appl_Code             614388
Appl_Recd_Date        614387
Appl_Dec              614388
numAppsPerProc        614388
dec                   614388
dtype: int64

In [11]:
# Preview the first 10 rows of the per-proceeding table.
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, comp_date, tracid, nat

In [12]:
# Read the master proceedings table from the raw data directory.
master = pd.read_csv(f'{path}/master.csv', low_memory=False)
#master.describe() # summary stats

In [13]:
# Keep only rows where both the case id and the proceeding id are present.
id_columns = ['idncase', 'idnproceeding']
ids_present = master[id_columns].notna().all(axis=1)
master = master[ids_present]
#master.describe()

In [14]:
# osc_date: date charges were filed (or NTA).
# Step 1: discard rows with no osc_date at all.
master = master[master['osc_date'].notna()]

# Step 2: compare on the text form. The %d%b%Y layout parsed below is two
# digits + three letters + four digits, so only 9-character values are kept.
master['osc_date'] = master['osc_date'].astype('str')
nine_chars = master['osc_date'].str.len() == 9
master = master[nine_chars]

# Step 3: convert the remaining strings to datetimes.
master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y')
#master.describe()

In [None]:
# comp_date: date the proceeding was completed.
# Step 1: discard rows with no comp_date at all.
master = master[master['comp_date'].notna()]

# Step 2: compare on the text form. The %d%b%Y layout parsed below is two
# digits + three letters + four digits, so only 9-character values are kept.
master['comp_date'] = master['comp_date'].astype('str')
nine_chars = master['comp_date'].str.len() == 9
master = master[nine_chars]

# Step 3: convert the remaining strings to datetimes.
master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y')

In [None]:
# Preview the first 10 rows of master after the date cleaning.
master.head(10)

In [None]:
# One row per (idncase, idnproceeding). Expected to be a no-op, since
# idnproceeding values are supposed to be unique already.
proceeding_key = ['idncase', 'idnproceeding']
master = master.drop_duplicates(subset=proceeding_key, keep='first')

In [None]:
# master2: only the variables of interest from master.
# An explicit copy makes master2 an independent frame, so overwriting one of
# its columns later does not raise pandas' SettingWithCopyWarning or leave it
# unclear whether master is affected.
master2 = master[['idncase','idnproceeding', 'osc_date', 'comp_date','tracid', 'nat']].copy()

In [None]:
# Preview the first 10 rows of the reduced master table.
master2.head(10)

In [None]:
# Non-null counts per column of the reduced master table.
master2.count()

In [None]:
# Cast the proceeding id to float64 -- presumably so it matches the float
# idnproceeding values in app2 when the two tables are merged.
# assign() returns a new frame instead of writing into master2 in place,
# which avoids SettingWithCopyWarning when master2 is a slice of master.
master2 = master2.assign(idnproceeding=master2['idnproceeding'].astype('float64'))


## Merge datasets

In [None]:
# Inner join of the per-proceeding applications with the master table;
# a row survives only if both key columns match in both tables.
merge_keys = ['idnproceeding', 'idncase']
merged = app2.merge(master2, how='inner', on=merge_keys)

In [None]:
# Non-null counts per column of the merged table.
merged.count()

In [None]:
# Preview the first 10 rows of the merged table.
merged.head(10)

In [None]:
# Rows lacking a tracid or a nat value are discarded.
required_columns = ['tracid', 'nat']
fully_identified = merged[required_columns].notna().all(axis=1)
merged = merged[fully_identified]
merged.count()


In [None]:
 merged.groupby('tracid').count()

In [None]:
# Drop all cases whose judge (tracid) has fewer than 100 cases -- the same
# cutoff as in the gambler's fallacy paper.
cases_per_judge = merged.groupby('tracid')['idnproceeding'].count()
has_100_cases = (cases_per_judge >= 100).values  # one flag per judge
tracid_100 = cases_per_judge.index.values[has_100_cases]  # ids of the judges kept
merged2 = merged.loc[merged['tracid'].isin(tracid_100)]
merged2.count()

In [None]:
# NOTE: 6377 rows have an osc_date before 1985, although 1985 was expected
# to be the earliest year.
osc_before_1985 = merged2['osc_date'].dt.year < 1985
merged2[osc_before_1985]
# In these rows the osc year is often many years before Appl_Recd_Date.
# Some may be data errors, but some may genuinely have occurred well before
# the court date.


In [None]:
# comp_date: 626 completion dates fall before 1985; drop those rows.
# The count is printed explicitly, because a bare expression that is not the
# last line of a cell is evaluated and then silently discarded.
comp_year = merged2['comp_date'].dt.year
print('comp_date before 1985:', int((comp_year < 1985).sum()))

# .copy() makes merged2 an independent frame, so columns can be added to it
# later without SettingWithCopyWarning.
merged2 = merged2[comp_year > 1984].copy()
merged2.count()

In [None]:
# Non-null counts of every column, split by decision (dec).
merged2.groupby('dec').count()

In [None]:
# numProcPerCase: number of asylum proceedings (rows of merged2) filed under
# the same idncase. assign() builds a new frame rather than writing into
# merged2 in place, since merged2 may be a filtered slice of another frame.
merged2 = merged2.assign(
    numProcPerCase=merged2.groupby('idncase')['idncase'].transform('count')
)

# Make the data unique at the idncase level with the same ordering used for
# applications: GRANT before DENY, then ASYL/ASYW/WCAT, then most recent
# Appl_Recd_Date. A case therefore counts as a grant if ANY proceeding was
# a grant.
merged_case = merged2.sort_values(
    ['idncase', 'dec', 'Appl_Code', 'Appl_Recd_Date'],
    ascending=[True, False, True, False],
)

# drop_duplicates keeps the whole top-ranked row. GroupBy.first() instead
# returns the first NON-NULL value of each column separately, which can mix
# fields from different proceedings of the same case.
merged_case = merged_case.dropna(subset=['idncase'])  # groupby also skipped missing keys
merged_case = merged_case.drop_duplicates(subset='idncase', keep='first')

# Key column first and a fresh 0..n-1 index, matching the layout that
# groupby(..., as_index=False) produced.
merged_case = merged_case[['idncase'] + [c for c in merged_case.columns if c != 'idncase']]
merged_case = merged_case.reset_index(drop=True)

In [None]:
# Non-null counts of every column at the case level, split by decision (dec).
merged_case.groupby('dec').count()

In [None]:
# Distinct nationality codes present in the case-level data.
unique_nat = merged_case['nat'].unique()
unique_nat

In [None]:
# Remove the 159 cases whose nationality code is the unknown marker '??'.
nat_is_known = merged_case['nat'] != '??'
merged_case = merged_case.loc[nat_is_known]

In [None]:
# Nationality lookup table. It is read without a header row, so its columns
# are numbered; column 1 is used below as the list of valid codes.
nat_lut = pd.read_csv(f'{path}/tblLookupNationality.csv', header=None)

# Drop the 4 observations whose nationality code is not in the lookup table.
valid_codes = nat_lut[1]
merged_case = merged_case.loc[merged_case['nat'].isin(valid_codes)]

# Drop the 2 observations with nationality code XX, which the lookup table
# says corresponds to "BE REMOVED FROM THE UNITED STATES".
merged_case = merged_case.loc[merged_case['nat'] != "XX"]
merged_case.count()

In [None]:
# Write the case-level dataset to a CSV in the working directory, without
# the row index.
output_file = 'merged_any_master_app.csv'
merged_case.to_csv(output_file, index=False)