In [46]:
import pandas as pd
import numpy as np
#from sklearn.preprocessing import CategoricalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier

## Creating Test/Train Sets

In [2]:
file = pd.read_csv('../merged_any_master_schedule.csv')
test_cases = pd.read_csv('../test_cases_any_asylum_full_model.csv', header=None)

In [3]:
test_cases = test_cases.rename(columns={0:'num'})

In [18]:
train = file[~file.idncase.isin(test_cases.num)]

In [21]:
test = file[file.idncase.isin(test_cases.num)]

In [6]:
class Cleaning(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.drop(columns=['idncase', 'idnproceeding', 'adj_date', 'adj_time_start2', 'adj_time_stop2', 'osc_date_y'])
        X.loc[(X["dec"] == 'DENY'),'dec'] = 0
        X.loc[(X["dec"] == 'GRANT'),'dec'] = 1
        X['comp_date'] = pd.to_datetime(X['comp_date'],infer_datetime_format = True)
        startdate = np.datetime64('1984-07-11')
        X['comp_date_in_days'] = X['comp_date'].apply(lambda x: (x - startdate).days)
        X = X.drop(columns=['comp_date'], axis=1)
        return X
    

In [23]:
clean = Cleaning()
train_data = clean.transform(train)

In [24]:
y_train = train_data['dec']
X_train = train_data.drop(columns=['dec'], axis=1)

## Generating Pipeline

In [26]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        num_frame = X.select_dtypes(include=[self.dtype])
        self.names = num_frame.columns
        return num_frame
    def get_feature_names(self):
        return self.names.tolist()

In [74]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        names = []
        for col in X.columns:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
            #cat_frame = X.apply(le.fit_transform)
            names.extend(le.classes_)
        self.names = names
        return X
    def get_feature_names(self):
        return self.names

In [28]:
num_pipeline = Pipeline([
        ('selector', Selector(np.number)),
        ('scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
        ('selector', Selector('object')),
        ('labeler', CustomLabelEncoder()),
        ('encoder', OneHotEncoder()),
]) 

full_pipeline = FeatureUnion(transformer_list=[ 
        ('numerical', num_pipeline),  
        ('categorical', cat_pipeline)   
])  

In [75]:
X_train_tr= full_pipeline.fit_transform(X_train)

## Hyperparameter Grid Search On Logistic Regression

In [30]:
param_grid = {'penalty': ['l1','l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [31]:
log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tr, y_train)
result = pd.DataFrame(grid_search.cv_results_)



In [32]:
print(result)

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0        2.373631         0.013899         0.643052          0.726646   0.001   
1        2.424796         0.019857         0.669493          0.765314   0.001   
2       10.345317         0.032659         0.680687          0.776167    0.01   
3        4.903254         0.017303         0.694461          0.786889    0.01   
4       53.518398         0.022968         0.702331          0.792865     0.1   
5        9.248451         0.018657         0.704239          0.794846     0.1   
6      100.118109         0.015562         0.704828          0.796316       1   
7       21.231059         0.014013         0.704770          0.796598       1   
8       97.647666         0.013785         0.703872          0.796872      10   
9       41.358622         0.021417         0.703997          0.796837      10   
10      20.346499         0.018359         0.703046          0.796869     100   
11      51.879921         0.

In [33]:
final_model = grid_search.best_estimator_
feature_weight = np.abs(grid_search.best_estimator_.coef_[0])

In [38]:
final_model.feature_names

AttributeError: 'LogisticRegression' object has no attribute 'feature_names'

## Evaluate on Test Set

In [34]:
test_data = clean.transform(test)
y_test = test_data['dec']
X_test = test_data.drop(columns=['dec'], axis=1)
X_test_tr = full_pipeline.transform(X_test)

In [None]:
#num_features = num_pipeline.named_steps['selector'].get_feature_names()
#cat_features = cat_pipeline.named_steps['labeler'].get_feature_names()

In [37]:
print(final_model.score(X_test_tr, y_test))

0.696774692252691


## Hyperparameter Grid Search On Decision Tree

In [42]:
param_grid_dt = {"max_depth": [3, None],
        "max_features": randint(1, 10),
        "min_samples_leaf": randint(1, 10),
        "criterion": ["gini", "entropy"]}

In [45]:
dec_tree = DecisionTreeClassifier()
grid_search_dt = RandomizedSearchCV(dec_tree, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt.fit(X_train_tr, y_train)
result = pd.DataFrame(grid_search_dt.cv_results_)
print(result)

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       0.905043         0.198299         0.634790          0.690383   
1       0.450950         0.051113         0.520861          0.520876   
2       0.445638         0.057486         0.525204          0.540857   
3       0.754962         0.107842         0.576995          0.640315   
4       0.429582         0.068765         0.520272          0.521501   
5       0.446517         0.073850         0.522494          0.530022   
6       0.418423         0.056192         0.520634          0.521245   
7       0.587739         0.097633         0.630442          0.683123   
8       0.961293         0.121219         0.590372          0.688161   
9       0.392032         0.060565         0.526538          0.527558   

  param_criterion param_max_depth param_max_features param_min_samples_leaf  \
0         entropy            None                  9                      4   
1         entropy               3                



In [47]:
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_features': [2, 4, 6, 8, 10],
    'bootstrap': [False, True],
    'max_depth':[1, 2, 3, 4, 5]
}

In [48]:
rf = RandomForestClassifier()
grid_search_rf = RandomizedSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train_tr, y_train)
result = pd.DataFrame(grid_search_rf.cv_results_)
print(result)

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       4.311742         0.642632         0.520866          0.520969   
1      17.631970         1.759429         0.574062          0.645420   
2      20.806815         3.863836         0.529359          0.522397   
3      38.603262         4.097031         0.585058          0.667455   
4       4.603935         0.833271         0.520866          0.520866   
5       7.582596         0.902206         0.539572          0.538654   
6       9.046582         1.063145         0.573400          0.625021   
7       7.293392         1.063364         0.563873          0.612716   
8      27.285051         2.804135         0.570444          0.646634   
9      18.579701         1.360250         0.615466          0.685966   

  param_bootstrap param_max_depth param_max_features param_n_estimators  \
0            True               1                  4                100   
1           False               4                  6     



## Evaluate Random Forest Classifier on Test Set

In [49]:
final_model_rf = grid_search_rf.best_estimator_
print(final_model_rf.score(X_test_tr, y_test))

0.6195914739018688


In [51]:
feature_weight_rf = grid_search_rf.best_estimator_.feature_importances_
#print(len(feature_weight_rf))
#num_features = num_pipeline.named_steps['selector'].get_feature_names()
cat_features = list(cat_pipeline.named_steps['encoder'].categories_[0])

1349
