In [1]:
import pandas as pd
import numpy as np
#from sklearn.preprocessing import CategoricalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Full case table used for both the training and the held-out split.
file = pd.read_csv('../merged_any_master_schedule.csv')

# List of held-out case ids; the CSV has no header row, so its single
# identifier column arrives labelled 0.
test_cases = pd.read_csv(
    '../test_cases_any_asylum_full_model.csv',
    header=None,
)

In [3]:
# Give the unnamed first column (label 0) a readable name.
test_cases = test_cases.rename(columns={0: 'num'})

In [4]:
# Training rows: every case whose id is NOT in the held-out list.
# .copy() makes `train` an independent frame; later cells assign columns on it
# in place, which on a filtered slice of `file` triggers pandas'
# SettingWithCopyWarning.
train = file[~file.idncase.isin(test_cases.num)].copy()

In [None]:
# Held-out rows: the cases whose id appears in the test-case list.
is_held_out = file['idncase'].isin(test_cases['num'])
test = file[is_held_out]

In [None]:
class Cleaning(BaseEstimator, TransformerMixin):
    """Stateless transformer bundling the notebook's row-level cleaning steps.

    Steps performed by ``transform``:

    1. drop the columns listed in ``DROPPED_COLUMNS``;
    2. recode ``dec`` from ``'DENY'``/``'GRANT'`` to ``0``/``1``;
    3. replace ``comp_date`` with ``comp_date_in_days``, the whole number of
       days between ``comp_date`` and ``START_DATE``.
    """

    # Columns removed before modelling.
    DROPPED_COLUMNS = ['idncase', 'idnproceeding', 'adj_date',
                       'adj_time_start2', 'adj_time_stop2', 'osc_date_y']
    # Numeric codes for the decision labels stored in ``dec``.
    DECISION_CODES = {'DENY': 0, 'GRANT': 1}
    # Day zero of the ``comp_date_in_days`` feature.
    START_DATE = pd.Timestamp('1984-07-11')

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column in ``DROPPED_COLUMNS`` plus ``dec`` and
            ``comp_date``.

        Returns
        -------
        pandas.DataFrame
            ``X`` without the dropped columns and ``comp_date``, with ``dec``
            recoded and ``comp_date_in_days`` appended as the last column.

        Raises
        ------
        KeyError
            If any of the required columns is missing.
        """
        # drop() returns a new frame, so the assignments below never touch
        # the caller's data.
        X = X.drop(columns=self.DROPPED_COLUMNS)

        # Labels other than DENY/GRANT are passed through unchanged.
        # Series.map re-infers the dtype from the mapped values, so a fully
        # recoded column comes out as integers instead of staying ``object``
        # (scikit-learn rejects object-dtype targets holding non-strings).
        codes = self.DECISION_CODES
        X['dec'] = X['dec'].map(lambda label: codes.get(label, label))

        # `infer_datetime_format` is deliberately not passed: it is deprecated
        # since pandas 2.0, where format inference is already the default.
        completed = pd.to_datetime(X['comp_date'])
        # Vectorised day difference; missing dates become NaN.
        X['comp_date_in_days'] = (completed - self.START_DATE).dt.days

        return X.drop(columns=['comp_date'])
    

In [5]:
# Remove the columns that are not used as model features.
train = train.drop(columns=[
    'idncase',
    'idnproceeding',
    'adj_date',
    'adj_time_start2',
    'adj_time_stop2',
    'osc_date_y',
])

In [6]:
# Recode the decision label as a number (DENY -> 0, GRANT -> 1); any other
# value is passed through unchanged.  Series.map re-infers the dtype from the
# mapped values, so a fully recoded column becomes integer instead of staying
# ``object`` -- scikit-learn rejects object-dtype targets holding non-strings.
train['dec'] = train['dec'].map(
    lambda label: {'DENY': 0, 'GRANT': 1}.get(label, label)
)

In [7]:
# Parse the completion date into datetime64.  `infer_datetime_format` is no
# longer passed: it is deprecated since pandas 2.0, where format inference is
# already the default behaviour.
train['comp_date'] = pd.to_datetime(train['comp_date'])

In [8]:
# Peek at the two earliest completion dates.
# NOTE(review): the saved output below lists five rows, so it appears to
# predate this head(2) call -- re-run the cell to refresh it.
train.sort_values(by='comp_date').head(n=2)

Unnamed: 0,numAppsPerProc,dec,nat,case_type,c_asy_type,comp_date,numProcPerCase,attorney_present,lang_hearing,hearing_city,...,base_city,base_state,sched_type,notice_desc,trac_id,durationHearing,adj_rsn_desc,numHearingsPerProc,durationFirstLastHearing,averageHearingDur
451,3.0,0,IR,DEP,def,1985-01-07,1,NO,ENGLISH,DETROIT,...,DETROIT,MI,UNKNOWN,NO ADDRESS MTN TO REOPEN ORDER OF THE IJ - SNA,230.0,60.0,UNKNOWN,1,0,60.0
2259,3.0,0,CO,DEP,def,1985-01-09,1,YES,SPANISH,BOSTON,...,BOSTON,MA,UNKNOWN,NO ADDRESS MTN TO REOPEN ORDER OF THE IJ - SNA,226.0,120.0,UNKNOWN,1,0,120.0
2258,3.0,0,CO,DEP,def,1985-01-10,1,YES,UNKNOWN LANGUAGE,BOSTON,...,BOSTON,MA,UNKNOWN,NO ADDRESS MTN TO REOPEN ORDER OF THE IJ - SNA,226.0,60.0,UNKNOWN,1,0,60.0
1924,3.0,0,IR,DEP,def,1985-01-23,1,YES,UNKNOWN LANGUAGE,KANSAS CITY,...,KANSAS CITY,MO,UNKNOWN,NO ADDRESS MTN TO REOPEN ORDER OF THE IJ - SNA,26.0,60.0,UNKNOWN,1,0,60.0
2572,2.0,0,ES,DEP,def,1985-01-28,1,YES,SPANISH,NEW YORK,...,NEW YORK,NY,UNKNOWN,NO ADDRESS MTN TO REOPEN ORDER OF THE IJ - SNA,331.0,60.0,UNKNOWN,4,184,60.0


In [10]:
# Day zero for the elapsed-days feature (the same date Cleaning.transform uses
# for the held-out frame).  No remaining cell defines `startdate` at notebook
# level, so it is defined here to keep Restart & Run All working.
startdate = pd.Timestamp('1984-07-11')

# Whole days between each completion date and `startdate`; vectorised
# equivalent of applying `(x - startdate).days` row by row.
train['comp_date_in_days'] = (train['comp_date'] - startdate).dt.days

In [11]:
# The raw date is now represented by comp_date_in_days; drop the original.
train = train.drop(columns='comp_date')

In [12]:
# Feature matrix = every column except the decision; target = the decision.
X = train.drop(columns='dec')
y = train['dec']
# Disabled random hold-out split, kept for reference:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Generating Pipeline

In [55]:
class Selector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the columns matching one dtype."""

    def __init__(self, dtype):
        """Remember the dtype (anything ``DataFrame.select_dtypes`` accepts)."""
        self.dtype = dtype

    def fit(self, X, y=None):
        """No-op fit; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return the sub-frame of ``X`` whose columns match ``self.dtype``.

        As a side effect the selected column labels are stored in
        ``self.names`` for ``get_feature_names``.
        """
        selected = X.select_dtypes(include=[self.dtype])
        self.names = selected.columns
        return selected

    def get_feature_names(self):
        """Column labels chosen by the most recent ``transform``, as a list."""
        column_index = self.names
        return column_index.tolist()

In [72]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode every column of a frame with a vocabulary learned in ``fit``.

    One ``LabelEncoder`` is fitted per column during ``fit`` and reused by
    every later ``transform`` call, so a category receives the same integer
    code in the training and the held-out data.  (Re-fitting inside
    ``transform`` would renumber the categories for each frame it sees.)
    """

    def fit(self, X, y=None):
        """Learn the sorted category vocabulary of each column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame of categorical columns.
        y : ignored

        Returns
        -------
        self
        """
        self.encoders_ = {
            column: LabelEncoder().fit(X[column]) for column in X.columns
        }
        # One "<column>_<category>" label per learned category, in column
        # order and then category order -- i.e. covering every column rather
        # than only the last one encoded.
        self.names = [
            '{}_{}'.format(column, category)
            for column in X.columns
            for category in self.encoders_[column].classes_
        ]
        return self

    def transform(self, X):
        """Replace each value by the position of its category in the fitted vocabulary.

        Values that were not seen during ``fit`` are encoded as ``-1``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns that were passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Integer codes with the same index and columns as ``X``.

        Raises
        ------
        KeyError
            If ``X`` has a column that was not present during ``fit``.
        """
        if not hasattr(self, 'encoders_'):
            # Backward compatibility: calling transform() without a prior
            # fit() used to work, so learn the vocabulary from this frame.
            self.fit(X)

        encoded = {}
        for column in X.columns:
            vocabulary = pd.Index(self.encoders_[column].classes_)
            # get_indexer returns -1 for values absent from the vocabulary.
            encoded[column] = vocabulary.get_indexer(X[column])
        return pd.DataFrame(encoded, index=X.index, columns=X.columns)

    def get_feature_names(self):
        """Return the ``"<column>_<category>"`` labels learned in ``fit`` as a list."""
        return list(self.names)

In [73]:

# Numeric columns: select by dtype, then standardise.
num_pipeline = Pipeline([
        ('selector', Selector(np.number)),
        ('scaler', StandardScaler()),
])

# Categorical (object) columns: select, integer-encode, then one-hot encode.
# handle_unknown='ignore' makes a category code that was not present when the
# encoder was fitted produce an all-zero group at transform time instead of
# raising, so transforming the held-out set cannot abort on a rare category.
cat_pipeline = Pipeline([
        ('selector', Selector('object')),
        ('labeler', CustomLabelEncoder()),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Column-wise concatenation of both branches into one design matrix.
full_pipeline = FeatureUnion(transformer_list=[
        ('numerical', num_pipeline),
        ('categorical', cat_pipeline)
])

In [74]:
# Fit the preprocessing pipeline on the training features and build the
# transformed design matrix in one step.
X_tr = full_pipeline.fit_transform(X)

## Hyperparameter Grid Search 

In [17]:
# Search space: both penalty types crossed with seven values of C spanning
# 1e-3 .. 1e3 in powers of ten.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

In [18]:
# liblinear is requested explicitly because the grid contains both 'l1' and
# 'l2': the default solver of current scikit-learn releases (lbfgs) does not
# support 'l1'.  random_state pins liblinear's internal shuffling so the
# search is repeatable.
log_reg = LogisticRegression(solver='liblinear', random_state=42)

# return_train_score=True keeps the mean_train_score columns in cv_results_
# (they are not computed by default in current scikit-learn releases).
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_tr, y)

# One row per parameter combination.
result = pd.DataFrame(grid_search.cv_results_)



In [19]:
# Plain-text dump of the cross-validation results table built from
# grid_search.cv_results_.
print(result)

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0        0.942166         0.006413         0.643061          0.726648   0.001   
1        1.352957         0.006362         0.669493          0.765314   0.001   
2        3.571588         0.006252         0.680682          0.776185    0.01   
3        2.203953         0.007030         0.694461          0.786889    0.01   
4       17.446706         0.008017         0.702258          0.792867     0.1   
5        4.167243         0.007036         0.704239          0.794846     0.1   
6       40.928398         0.006685         0.704838          0.796323       1   
7        8.928708         0.007158         0.704770          0.796598       1   
8       39.456467         0.007051         0.703867          0.796869      10   
9       17.957599         0.006143         0.703997          0.796837      10   
10       6.656963         0.006687         0.702983          0.796854     100   
11      24.527359         0.

In [84]:
# Best estimator found by the grid search.
final_model = grid_search.best_estimator_

# Magnitude of each coefficient in the first row of coef_.
feature_weight = np.abs(final_model.coef_[0])

## Evaluate on Test Set

In [None]:
# Apply the same cleaning steps to the held-out rows ...
clean = Cleaning()
test = clean.transform(test)

# ... separate features from the decision column ...
X_test = test.drop(columns='dec')
y_test = test['dec']

# ... and push the features through the already-fitted preprocessing pipeline
# (transform only -- nothing is re-fitted here).
X_test_tr = full_pipeline.transform(X_test)

In [80]:
#num_features = num_pipeline.named_steps['selector'].get_feature_names()
#cat_features = cat_pipeline.named_steps['labeler'].get_feature_names()


67


In [None]:
# Score of the tuned model on the held-out cases.  Despite its name,
# `final_pred` holds the value returned by .score(), not predictions.
final_pred = final_model.score(X_test_tr, y_test)
# Show the score as the cell output; it was previously computed but never shown.
final_pred