# Cleaning_full_asylum.ipynb
#### This script cleans and merges relevant variables across datasets
####  it only considers proceedings/cases associated with full asylum applications. creates a dataset unique at the idncase level  (within proceeding, it picks the asylum application and within case, it picks the proceeding associated with the asylum application, prioritizing by date --most recent first)
#### Currently, it is doing cleaning and merging only for the baseline model.

In [1]:
import pandas as pd
import numpy as np
pd.set_option('precision', 5)

In [2]:
path = '/data/Dropbox/Data/Asylum_Courts/raw'

## Clean court_appln.csv

relevant variables: idnProceeding, idnCase, Appl_Code

In [3]:
app = pd.read_csv(path + '/court_appln.csv', low_memory=False)

# descriptive stats
#app.count()
#app.describe()

In [4]:
# adding placeholder number to count how many applications were filed for idnProceeding
app['numAppsPerProc'] = 1
app['numAppsPerProc'] = app['numAppsPerProc'].astype('int64')

# adding additional feature based on how many applications have been filed for the same (idnCase, idnProceeding) pair
app['numAppsPerProc'] = app.groupby(['idnCase', 'idnProceeding'])['numAppsPerProc'].transform('count')

# dropping all applications with empty decisions
app = app.dropna(subset=['Appl_Dec'])

#app.describe()

In [5]:
# making a new variable, dec, simplifying grant decisions to DENY, GRANT, or nan
app['dec']= np.nan
app.loc[app.Appl_Dec.isin(['G','F','N','L','C']),'dec']= 'GRANT'
app.loc[(app["Appl_Dec"] == 'D'),'dec'] = 'DENY'
app = app[app.dec.isin(['DENY','GRANT'])] # only include DENY or GRANT cases

#app.count()


In [6]:
# only keep applications of type ASYL. sort by date within idnproceeding
#sorting by date--if there are multiple applications with the same decision with the same case type, 
#take the most recent one.
app = app[app.Appl_Code.isin(['ASYL'])]

#sort multiple times because some need to be ascending and some descending
app = app.sort_values(['idnProceeding','Appl_Recd_Date'],ascending=[True,False])
#len(app)

In [7]:
app = app.rename(columns={"idnCase":"idncase", "idnProceeding":"idnproceeding"})

In [8]:
app.head(10)

Unnamed: 0,idnProceedingAppln,idnproceeding,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
40,41,75.0,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
42,43,85.0,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
48,49,103.0,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
51,52,111.0,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
55,56,136.0,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
57,58,139.0,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
59,60,145.0,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
60,61,147.0,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
61,62,149.0,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
62,63,159.0,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


In [9]:
# make unique--take the first application for each proceeding, when sorted in order dec (grant deny),
#case type(ASYL, ASYW, WCAT), date
app2 = app.groupby('idnproceeding', as_index=False).first()


In [10]:
app2.count()

idnproceeding         573565
idnProceedingAppln    573565
idncase               573565
Appl_Code             573565
Appl_Recd_Date        573563
Appl_Dec              573565
numAppsPerProc        573565
dec                   573565
dtype: int64

In [11]:
app2.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY


## Clean master.csv

Relevant variables: idncase, idnproceeding, osc_date, tracid, nat

In [12]:
# load in data
master = pd.read_csv(path + '/master.csv', low_memory=False)

In [13]:
#drop empty cases and proceedings
master = master.dropna(subset= ['idncase','idnproceeding'])
#master.describe()

In [14]:
# stuff on osc_date (date charges filed or NTA)
master = master.dropna(subset=['osc_date']) # dropping empty dates

master['osc_date'] = master['osc_date'].astype('str')
master = master[master['osc_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['osc_date'] = pd.to_datetime(master['osc_date'], format='%d%b%Y') # change to date format 
#master.describe()
# delete NTA dates before 1984
master = master[master.osc_date.dt.year>1983]

In [15]:
#comp date (date proceeding completed)
master = master.dropna(subset=['comp_date']) # dropping empty dates

master['comp_date'] = master['comp_date'].astype('str')
master = master[master['comp_date'].apply(lambda x: len(x) == 9)] # delete dates invalid formats

master['comp_date'] = pd.to_datetime(master['comp_date'], format='%d%b%Y') # change to date format 

#drop comp date dates before 1985
master = master[master.comp_date.dt.year>1984]


In [16]:
master.head(10)

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
11,2046920.0,MX,RMV,,3200048,CHI,CHD,O,X,,2004-08-06,11AUG2004,2004-08-11,,RDV,31.0
12,2046921.0,MX,RMV,,3200049,CHI,CHD,O,X,,2004-08-06,10AUG2004,2004-08-11,,RDV,31.0
13,2046922.0,MX,RMV,,3200050,CHI,CHD,O,X,,2004-08-09,19AUG2004,2004-08-19,,JLG,29.0
14,2046923.0,PL,RMV,,3200051,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-25,1.0,CC,27.0
15,2046923.0,PL,RMV,,3525150,CHI,CHD,,,T,2004-08-09,30MAR2005,2005-04-13,1.0,GPK,30.0
16,2046923.0,PL,RMV,,3538044,CHI,CHI,O,R,,2004-08-09,13APR2005,2007-06-04,1.0,CC,27.0
17,2046924.0,MX,RMV,,3200052,CHI,CHD,O,X,,2004-08-09,13AUG2004,2004-08-13,,RDV,31.0
18,2046925.0,MX,RMV,,3200053,CHI,CHD,O,X,,2004-08-10,19AUG2004,2004-08-19,,JLG,29.0
19,2046926.0,MX,RMV,,3200054,CHI,CHD,O,X,,2004-08-10,16AUG2004,2004-08-30,,CMZ,32.0
20,2046927.0,MX,RMV,,3200055,CHI,CHD,O,X,,2004-08-12,19AUG2004,2004-08-19,,JLG,29.0


In [17]:
# delete duplicates (since idnproceeding are unique, this shouldn't do anything)
master = master.drop_duplicates(subset=['idncase', 'idnproceeding'])

In [18]:
master.count()

idncase             5669748
nat                 5667915
case_type           5669747
c_asy_type          1515478
idnproceeding       5669748
base_city_code      5669737
hearing_loc_code    5669646
dec_type            4534187
dec_code            4379061
other_comp          1290572
osc_date            5669748
input_date          5667795
comp_date           5669748
attorney_flag       2743993
ij_code             5637391
tracid              5360370
dtype: int64

In [19]:
master['idnproceeding'] = master['idnproceeding'].astype('float64')


In [20]:
#replace nan attorney flags with 0.
master.loc[pd.isnull(master.attorney_flag),'attorney_flag']=0


## Merge datasets

In [21]:
merged = pd.merge(app2, master, on=['idnproceeding','idncase'])

In [22]:
merged.count()

idnproceeding         567783
idnProceedingAppln    567783
idncase               567783
Appl_Code             567783
Appl_Recd_Date        567781
Appl_Dec              567783
numAppsPerProc        567783
dec                   567783
nat                   567635
case_type             567783
c_asy_type            566875
base_city_code        567781
hearing_loc_code      567780
dec_type              564927
dec_code              564912
other_comp              2874
osc_date              567783
input_date            567770
comp_date             567783
attorney_flag         567783
ij_code               566992
tracid                553505
dtype: int64

In [23]:
merged.head(10)

Unnamed: 0,idnproceeding,idnProceedingAppln,idncase,Appl_Code,Appl_Recd_Date,Appl_Dec,numAppsPerProc,dec,nat,case_type,...,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
0,75.0,41,3328085.0,ASYL,1994-12-20 00:00:00,D,3.0,DENY,HO,DEP,...,PIS,O,V,,1994-11-03,10NOV1994,1995-03-10,1.0,CAL,
1,85.0,43,3328111.0,ASYL,1995-04-02 00:00:00,D,3.0,DENY,HO,DEP,...,HOU,O,V,,1994-11-04,14DEC1994,1997-06-16,1.0,WKZ,71.0
2,103.0,49,3328153.0,ASYL,1995-05-05 00:00:00,G,2.0,GRANT,GT,DEP,...,NYC,W,D,,1994-11-05,20DEC1994,1995-08-08,1.0,JSC,139.0
3,111.0,52,3328175.0,ASYL,1995-03-31 00:00:00,D,3.0,DENY,ES,DEP,...,HOU,O,V,,1994-11-05,06DEC1994,1995-08-15,1.0,CMR,70.0
4,136.0,56,3327844.0,ASYL,1995-03-06 00:00:00,D,3.0,DENY,HO,DEP,...,HLG,O,D,,1995-02-06,06MAR1995,1995-04-06,1.0,JZ,50.0
5,139.0,58,3327852.0,ASYL,1996-05-14 00:00:00,G,2.0,GRANT,CU,DEP,...,MIA,O,R,,1995-02-06,05OCT1995,1996-05-14,1.0,RAJ,126.0
6,145.0,60,3327869.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,NU,DEP,...,MIA,O,V,,1995-02-10,23MAY1995,1996-01-04,1.0,WKZ,71.0
7,147.0,61,3327877.0,ASYL,1995-11-29 00:00:00,D,3.0,DENY,NU,DEP,...,MIA,O,V,,1995-02-10,12APR1995,1996-01-04,1.0,WKZ,71.0
8,149.0,62,3327884.0,ASYL,1995-04-14 00:00:00,D,3.0,DENY,NU,DEP,...,PIS,O,V,,1995-02-12,22FEB1995,1995-06-23,0.0,MB,61.0
9,159.0,63,3327927.0,ASYL,1995-07-25 00:00:00,D,3.0,DENY,NU,DEP,...,HOU,O,V,,1995-02-17,15MAR1995,1995-09-05,1.0,CMR,70.0


In [25]:
#drop nan tracids
merged = merged.dropna(subset=['tracid'])
merged.count()


idnproceeding         553505
idnProceedingAppln    553505
idncase               553505
Appl_Code             553505
Appl_Recd_Date        553503
Appl_Dec              553505
numAppsPerProc        553505
dec                   553505
nat                   553366
case_type             553505
c_asy_type            552606
base_city_code        553505
hearing_loc_code      553504
dec_type              550995
dec_code              550981
other_comp              2527
osc_date              553505
input_date            553492
comp_date             553505
attorney_flag         553505
ij_code               553505
tracid                553505
dtype: int64

In [26]:
#drop all cases where judge has fewer than 100 cases--same as in gambler's fallacy paper
tracid_100 = merged.groupby('tracid').idnproceeding.count()>=100 #bool indicating whether judge has at least 100 cases
tracid_100 = tracid_100.index.values[tracid_100]#indices of judges with at least 100 cases
merged2 = merged.loc[merged.tracid.isin(tracid_100)]
#merged2.count()

In [27]:
# adding additional feature based on how many full asylum proceedings have been filed for the same (idnCase) 
merged2['numProcPerCase'] = 1
merged2['numProcPerCase'] = merged2['numProcPerCase'].astype('int64')
merged2['numProcPerCase'] = merged2.groupby(['idncase'])['numProcPerCase'].transform('count')

#make unique at idncase level, sorting with the same logic as used to sort applications
merged_case = merged2.sort_values(['idncase','Appl_Recd_Date'],ascending=[True,False])
merged_case = merged_case.groupby('idncase',as_index=False ).first()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [28]:
merged_case.count()

idncase               528426
idnproceeding         528426
idnProceedingAppln    528426
Appl_Code             528426
Appl_Recd_Date        528424
Appl_Dec              528426
numAppsPerProc        528426
dec                   528426
nat                   528291
case_type             528426
c_asy_type            527538
base_city_code        528426
hearing_loc_code      528425
dec_type              526210
dec_code              526234
other_comp              2522
osc_date              528426
input_date            528414
comp_date             528426
attorney_flag         528426
ij_code               528426
tracid                528426
numProcPerCase        528426
dtype: int64

In [29]:
#get rid of merged_cases where other_comp is not null. other_comp indicates that the proceeding ended for a reason other than 
#a judge's decision, suggesting no decision was actually made. this is less than 1% of cases once we have already filtered
#out applications where the decision is not grant or deny and matched them to proceedings.
merged_case = merged_case[pd.isnull(merged_case.other_comp)]

#get rid of cases that don't have c_asy type (less than 1% of cases)
merged_case = merged_case[~pd.isnull(merged_case.c_asy_type)] 

#change values of c_asy_type to be more clear
merged_case.loc[merged_case.c_asy_type=='I','c_asy_type'] = 'aff'
merged_case.loc[merged_case.c_asy_type=='E','c_asy_type'] = 'def'

In [30]:
#drop variables that definitely won't be used as features (or won't be used to track where the data came from)
merged_case = merged_case.drop(['Appl_Dec','Appl_Code','Appl_Recd_Date','dec_type','other_comp','input_date','ij_code','dec_code'],axis=1)


In [31]:
#change ?? to unknwon for  159 cases with unknown nationalities
merged_case.loc[(merged_case.nat=='??'),'nat'] = 'unknown'

#mark na nats as unknown
merged_case.loc[pd.isnull(merged_case.nat),'nat'] = 'unknown'
#load nationality lookup table
nat_lut =  pd.read_csv(path+ '/tblLookupNationality.csv',header=None)

#mark 4 observations where the nationality code is not in the lookup table as unknown
merged_case.loc[~merged_case.nat.isin(nat_lut[1]),'nat'] = 'unknown'

#mark as unknown 2 observations with nationality code XX whic the LUT says corresponds 
#to "BE REMOVED FROM THE UNITED STATES"
merged_case.loc[(merged_case.nat=="XX"),'nat'] = 'unknown'

#examine counts for different nationalities:
nat_numbers = merged_case.groupby('nat',as_index=False).idncase.count().sort_values('idncase')

# some nationalities have only 1 or 2 observations. drop any with less than 10 observations.
nat_10 = nat_numbers.loc[nat_numbers['idncase']>9,'nat']
merged_case = merged_case[merged_case.nat.isin(nat_10)]

In [32]:
#remove cities with less than 10 obs (only removes one case)
city_numbers = merged_case.groupby('base_city_code',as_index=False).idncase.count().sort_values('idncase')
cities_10 = city_numbers.loc[city_numbers['idncase']>9,'base_city_code']

merged_case = merged_case[merged_case.base_city_code.isin(cities_10)]


In [33]:
#hearing loc--is this "court"? some of them are prisons/detention centers/airports. many "courts" have fewer than 10 obs
#drop thewse (less than 1% of proceedings)

court_numbers = merged_case.groupby('hearing_loc_code',as_index=False).idncase.count().sort_values('idncase')

courts_10 = court_numbers.loc[court_numbers['idncase']>9,'hearing_loc_code']
merged_case = merged_case[merged_case.hearing_loc_code.isin(courts_10)]


In [34]:
merged_case.groupby('dec').count()

Unnamed: 0_level_0,idncase,idnproceeding,idnProceedingAppln,numAppsPerProc,nat,case_type,c_asy_type,base_city_code,hearing_loc_code,osc_date,comp_date,attorney_flag,tracid,numProcPerCase
dec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
DENY,329289,329289,329289,329289,329289,329289,329289,329289,329289,329289,329289,329289,329289,329289
GRANT,195387,195387,195387,195387,195387,195387,195387,195387,195387,195387,195387,195387,195387,195387


In [35]:
#save data
merged_case.to_csv('/home/emilyboeke/merged_full_asylum_master_app.csv',index=False)