In [148]:
import os

import numpy as np
import pandas as pd
# Show floats with 5 decimal places. The fully-qualified key is required: in newer pandas
# the bare 'precision' pattern also matches 'styler.format.precision' and set_option raises.
pd.set_option('display.precision', 5)

In [141]:
# Directory holding the raw CSV extracts read below (court_appln.csv, master.csv, schedule.csv).
# Set the ASYLUM_COURTS_RAW_DIR environment variable to run against another location;
# the original hard-coded directory remains the default.
path = os.environ.get('ASYLUM_COURTS_RAW_DIR', '/data/Dropbox/Data/Asylum_Courts/raw')

## Cleaning court_appln.csv Dataset

In [26]:
# Load the raw application-level records.
court = pd.read_csv(path + '/court_appln.csv', low_memory=False)
# The identifier columns are read in as floats; cast them to categoricals so that
# describe() summarises them with count / unique / top / freq.
for id_col in ('idnProceeding', 'idnCase', 'idnProceedingAppln'):
    court[id_col] = court[id_col].astype('category')
# Non-null count per column of court_appln.csv as loaded.
court.count()

idnProceedingAppln    4559071
idnProceeding         4559069
idnCase               4559067
Appl_Code             4559069
Appl_Recd_Date        4559036
Appl_Dec              4232861
dtype: int64

In [27]:
# Summary stats (count / unique / top / freq) of court_appln.csv as loaded,
# i.e. before any rows have been filtered out.
court.describe()

Unnamed: 0,idnProceedingAppln,idnProceeding,idnCase,Appl_Code,Appl_Recd_Date,Appl_Dec
count,4559071,4559069.0,4559067.0,4559069,4559036,4232861
unique,4559071,2219694.0,1942504.0,147,252761,17
top,4679486,1177202.0,4611524.0,ASYL,1997-03-31 00:00:00,D
freq,1,14.0,45.0,1294601,5030,1229245


In [28]:
# Seed an int64 column of ones so there is a value to count inside each group.
court['numAppsPerProc'] = pd.Series(1, index=court.index, dtype='int64')
# Overwrite it with the number of applications that share the same
# (idnCase, idnProceeding) pair.
court['numAppsPerProc'] = (
    court
    .groupby(['idnCase', 'idnProceeding'])['numAppsPerProc']
    .transform('count')
)
# Discard every application that has no recorded decision.
court = court[court['Appl_Dec'].notnull()]

In [29]:
# With a numeric column now present, describe() reports only numAppsPerProc.
court.describe()

Unnamed: 0,numAppsPerProc
count,4232857.0
mean,2.581931
std,1.076856
min,1.0
25%,2.0
50%,3.0
75%,3.0
max,14.0


In [30]:
# Keep only applications whose decision code is 'D' (Deny) or 'G' (Grant).
court = court.loc[court['Appl_Dec'].isin(['D', 'G'])]
# Non-null count per column after the decision filter.
court.count()

idnProceedingAppln    2295485
idnProceeding         2295485
idnCase               2295485
Appl_Code             2295484
Appl_Recd_Date        2295459
Appl_Dec              2295485
numAppsPerProc        2295485
dtype: int64

In [35]:
# Keep only applications filed under the ASYL, ASYW or WCAT application codes.
court = court.loc[court['Appl_Code'].isin(['ASYL', 'ASYW', 'WCAT'])]
# Non-null count per column after the application-code filter.
court.count()

idnProceedingAppln    1151913
idnProceeding         1151913
idnCase               1151913
Appl_Code             1151913
Appl_Recd_Date        1151911
Appl_Dec              1151913
numAppsPerProc        1151913
dtype: int64

In [158]:
# Re-check of the per-column non-null counts of the filtered court frame.
court.count()

idnProceedingAppln    1151913
idnProceeding         1151913
idnCase               1151913
Appl_Code             1151913
Appl_Recd_Date        1151911
Appl_Dec              1151913
numAppsPerProc        1151913
dtype: int64

In [None]:
# (stray cell: it held only the bare, undefined name `c`, which raises NameError
# when the notebook is run top to bottom; intentionally left without code)

## Cleaning master.csv Dataset

In [112]:
# Load the raw case-level master table.
master = pd.read_csv(
    filepath_or_buffer=path + '/master.csv',
    low_memory=False,
)

In [113]:
# Inspect the dtypes pandas inferred for each master.csv column.
master.dtypes

idncase             float64
nat                  object
case_type            object
c_asy_type           object
idnproceeding        object
base_city_code       object
hearing_loc_code     object
dec_type             object
dec_code             object
other_comp           object
osc_date             object
input_date           object
comp_date            object
attorney_flag       float64
ij_code              object
tracid              float64
dtype: object

In [114]:
# These three columns are inferred as float64; treat them as categoricals so that
# describe() summarises them with count / unique / top / freq.
for id_col in ('idncase', 'attorney_flag', 'tracid'):
    master[id_col] = master[id_col].astype('category')

# Summary stats of master.csv as loaded (no rows removed yet).
master.describe()

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
count,6084423.0,6067662,6084422,1635071,6084437,6084413,6082921,4564888,4409689,1339567,6051084,6054693,5749361,3039846.0,6044839,5742523.0
unique,4729150.0,267,23,8,6084437,61,330,12,19,10,14510,13032,10372,1.0,549,401.0
top,3102179.0,MX,RMV,I,4468438,LOS,LOS,O,X,C,04MAY1989,31MAR1997,12JAN1991,1.0,DA,60.0
freq,16.0,2147177,4354277,877426,1,573840,495905,2741447,2105793,581757,2058,6968,13219,3039846.0,79810,79812.0


In [115]:
# Keep only the rows in which all three date fields are present.
master = master[master[['osc_date', 'input_date', 'comp_date']].notnull().all(axis=1)]

In [116]:
# Make sure every date field is a plain string before its length is checked.
for date_col in ('osc_date', 'input_date', 'comp_date'):
    master[date_col] = master[date_col].astype(str)

In [117]:
# Summary stats after dropping rows with a missing osc_date, input_date or comp_date.
master.describe()

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
count,5714382.0,5712217,5714381,1525404,5714391,5714380,5714291,4564532,4409568,1304708,5714391,5714391,5714391,2771903.0,5681911,5391579.0
unique,4513708.0,266,21,5,5714391,54,325,10,15,10,14505,12991,10359,1.0,545,401.0
top,3102179.0,MX,RMV,I,4468438,LOS,LOS,O,X,C,04MAY1989,31MAR1997,12JAN1991,1.0,DA,60.0
freq,16.0,2016048,3990682,803890,1,522782,445958,2741172,2105725,581699,2058,6968,13219,2771903.0,78025,78027.0


In [135]:
# Keep only rows whose date strings are exactly 9 characters long (e.g. 04MAY1989),
# checking one date field after another.
for date_col in ('osc_date', 'input_date', 'comp_date'):
    master = master[master[date_col].str.len() == 9]

In [136]:
# Summary stats after the 9-character date filter; in the recorded output the counts
# match the previous summary, i.e. the filter removed no rows.
master.describe()

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
count,5714382.0,5712217,5714381,1525404,5714391,5714380,5714291,4564532,4409568,1304708,5714391,5714391,5714391,2771903.0,5681911,5391579.0
unique,4513708.0,266,21,5,5714391,54,325,10,15,10,14505,12991,10359,1.0,545,401.0
top,3102179.0,MX,RMV,I,4468438,LOS,LOS,O,X,C,04MAY1989,31MAR1997,12JAN1991,1.0,DA,60.0
freq,16.0,2016048,3990682,803890,1,522782,445958,2741172,2105725,581699,2058,6968,13219,2771903.0,78025,78027.0


In [139]:
# Parse the day / abbreviated-month / year strings (e.g. 04MAY1989) into datetimes.
for date_col in ('osc_date', 'input_date', 'comp_date'):
    master[date_col] = master[date_col].pipe(pd.to_datetime, format='%d%b%Y')

In [143]:
# include='all' summarises every column, including the datetime ones; in the recorded
# output their 'first' / 'last' rows give the earliest and latest date in each column.
master.describe(include='all')

Unnamed: 0,idncase,nat,case_type,c_asy_type,idnproceeding,base_city_code,hearing_loc_code,dec_type,dec_code,other_comp,osc_date,input_date,comp_date,attorney_flag,ij_code,tracid
count,5714382.0,5712217,5714381,1525404,5714391.0,5714380,5714291,4564532,4409568,1304708,5714391,5714391,5714391,2771903.0,5681911,5391579.0
unique,4513708.0,266,21,5,5714391.0,54,325,10,15,10,14505,12991,10359,1.0,545,401.0
top,3102179.0,MX,RMV,I,4468438.0,LOS,LOS,O,X,C,1989-05-04 00:00:00,1997-03-31 00:00:00,1991-01-12 00:00:00,1.0,DA,60.0
freq,16.0,2016048,3990682,803890,1.0,522782,445958,2741172,2105725,581699,2058,6968,13219,2771903.0,78025,78027.0
first,,,,,,,,,,,1900-01-01 00:00:00,1900-01-01 00:00:00,1951-03-23 00:00:00,,,
last,,,,,,,,,,,2013-05-29 00:00:00,2013-05-31 00:00:00,2013-05-31 00:00:00,,,


In [144]:
# Keep the first row for each (idncase, idnproceeding) pair and discard later repeats.
master = master[~master.duplicated(subset=['idncase', 'idnproceeding'])]

In [155]:
# keeping only affirmative and defensive cases (c_asy_type 'E' or 'I')
master = master.loc[master['c_asy_type'].isin(['E', 'I'])]

## Cleaning schedule.csv Dataset

In [157]:
# Load the raw hearing-schedule records. low_memory=False matches the other two loads in
# this notebook and makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which avoids mixed-type columns.
sched = pd.read_csv(path + '/schedule.csv', low_memory=False)

In [161]:
# Treat the three identifier columns as categoricals rather than numbers.
for id_col in ('idnschedule', 'idncase', 'idnproceeding'):
    sched[id_col] = sched[id_col].astype('category')

In [162]:
# Summary stats of schedule.csv as loaded (identifier columns already cast to
# categoricals, no rows removed yet).
sched.describe()

Unnamed: 0,idnschedule,idncase,idnproceeding,adj_medium,schedule_type,adj_date
count,15377519,15377502.0,15377519,8825704,15377519,15377519
unique,2748024,4675871.0,5973070,6,24,12736
top,1270261,5630045.0,4324164,P,--,31MAR2011
freq,11,245.0,215,7487967,9056997,5846


In [163]:
# Cast the hearing date to text, then keep only the 9-character values (e.g. 31MAR2011).
sched['adj_date'] = sched['adj_date'].astype(str)
sched = sched[sched['adj_date'].str.len() == 9]

Unnamed: 0,idnschedule,idncase,idnproceeding,adj_medium,schedule_type,adj_date
count,15377519,15377502.0,15377519,8825704,15377519,15377519
unique,2748024,4675871.0,5973070,6,24,12736
top,1270261,5630045.0,4324164,P,--,31MAR2011
freq,11,245.0,215,7487967,9056997,5846


In [165]:
# Parse the day / abbreviated-month / year strings into datetimes.
sched['adj_date'] = sched['adj_date'].pipe(pd.to_datetime, format='%d%b%Y')
# Summary stats of the schedule frame with adj_date parsed.
sched.describe()

Unnamed: 0,idnschedule,idncase,idnproceeding,adj_medium,schedule_type,adj_date
count,15377519.0,15377502.0,15377519.0,8825704,15377519,15377519
unique,2748024.0,4675871.0,5973070.0,6,24,12736
top,1270261.0,5630045.0,4324164.0,P,--,2011-03-31 00:00:00
freq,11.0,245.0,215.0,7487967,9056997,5846
first,,,,,,1951-03-23 00:00:00
last,,,,,,2030-07-30 00:00:00


In [167]:
# TODO: investigate why the schedule contains a hearing dated 2030-07-30.
r = sched.loc[sched['adj_date'].eq('2030-07-30')]
print(r)

        idnschedule    idncase idnproceeding adj_medium schedule_type  \
3832862     2132155  6361860.0       5001252          p            MD   

          adj_date  
3832862 2030-07-30  
