In [None]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,Imputer,OneHotEncoder,RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from mlxtend.classifier import StackingCVClassifier
from scipy.stats import randint
import numpy as np
from time import time
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report,roc_curve,roc_auc_score

In [None]:
# Notebook display configuration: set IPython's `ast_node_interactivity` to
# "all" so that every top-level expression in a cell is displayed, rather than
# only the final one (several later cells rely on bare expressions mid-cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Pipelines

## Credit Risk Assessment: A Classification Problem

In [None]:
def get_missing_values_percentage(df):
    """Return the percentage of cells in ``df`` that are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    float
        ``100 * (number of null cells) / (total number of cells)``.
        An empty frame has no cells to be missing, so ``0.0`` is returned
        instead of dividing by zero.
    """
    # `df.size` is rows * columns; it replaces the removed `np.product(df.shape)`.
    total_values = df.size
    if total_values == 0:
        return 0.0
    total_missing = df.isnull().sum().sum()
    # percent of data that is missing
    return (total_missing / total_values) * 100

In [None]:
def get_classification_results(model,X_train,y_train,X_test,y_test,target_names=None):
    """Fit ``model`` and print a summary of its binary-classification quality.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``. It is fitted in
        place on the training data.
    X_train, y_train :
        Training features and labels.
    X_test, y_test :
        Hold-out features and labels used for every reported metric except
        the training-set score.
    target_names : list of str, optional
        Class display names; when given, a full classification report is
        printed as well.

    Returns
    -------
    None
        All results are printed.
    """
    model = model.fit(X_train, y_train)
    print("Training set score: {:.3f}".format(model.score(X_train, y_train)))
    print("Test set score: {:.3f}".format(model.score(X_test, y_test)))
    preds = model.predict(X_test)

    # ROC-AUC measures ranking quality, so it needs continuous scores. Feeding
    # it hard 0/1 predictions collapses the curve to a single operating point.
    # Fall back to the labels only when the model exposes no score at all.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = preds

    confusion = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n{}".format(confusion))
    print('F1 score = {:.3f}'.format(f1_score(y_test, preds)))
    print('ROC-AUC Score = {:.3f}'.format(roc_auc_score(y_test, scores)))
    if target_names is not None:
        print(classification_report(y_test, preds,target_names=target_names))
    

In [None]:
class myImputer(BaseEstimator, TransformerMixin):
    """DataFrame imputer that treats zeros as missing values.

    Zeros are replaced by NaN and the resulting gaps are filled according to
    ``strategy``:

    * ``'median'`` - fill each column with its median.
    * ``'most_frequent'`` - encode each column as integer category codes and
      fill gaps (including values unseen during ``fit``) with the most
      frequent code of that column.
    * anything else - zeros are masked but nothing is filled.

    Statistics are learned in ``fit`` and reused in ``transform`` so that new
    data is imputed with the values learned from the fitting data. For
    backward compatibility, calling ``transform`` on an unfitted instance
    still works and derives the statistics from the frame being transformed.

    Parameters
    ----------
    strategy : str, default 'median'
        Imputation strategy, see above.
    """

    def __init__(self, strategy='median'):
        print('constructor with strategy {}'.format(strategy))
        self.strategy = strategy

    def fit(self, X, y=None):
        """Learn per-column fill values from the DataFrame ``X``."""
        print('fit called')
        self.y = y  # kept for backward compatibility; not used for imputation
        self.fill_values_, self.categories_ = self._learn_statistics(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with zeros masked and gaps filled."""
        X = X.replace(0, np.nan)
        print('transform called')
        if hasattr(self, 'fill_values_'):
            fill_values, categories = self.fill_values_, self.categories_
        else:
            # Unfitted instance: fall back to statistics of the frame itself.
            fill_values, categories = self._learn_statistics(X)

        if categories is not None:
            X = self._encode(X, categories)
        if fill_values is not None:
            X = X.fillna(fill_values)
        return X

    def _learn_statistics(self, X):
        """Compute ``(fill_values, categories)`` for the current strategy."""
        X = X.replace(0, np.nan)
        if self.strategy == 'median':
            return X.median(), None
        if self.strategy == 'most_frequent':
            categories = {col: X[col].astype('category').cat.categories
                          for col in X.columns}
            modes = self._encode(X, categories).mode()
            # `mode()` returns a frame (one row per tied mode); take the first
            # row so `fillna` fills by column instead of aligning by row label.
            fill_values = modes.iloc[0] if len(modes) else None
            return fill_values, categories
        return None, None

    @staticmethod
    def _encode(X, categories):
        """Replace each column by its category codes, keeping gaps as NaN."""
        X = X.copy()
        for col in X.columns:
            codes = pd.Categorical(X[col], categories=categories[col]).codes
            encoded = pd.Series(codes, index=X.index, dtype='float64')
            # Code -1 marks a missing or unseen value: keep it missing so it
            # is imputed rather than silently treated as a real category.
            X[col] = encoded.mask(encoded < 0)
        return X

In [None]:
def binaryRandomSampler(X,target,sample_type='under',random_state=None):
    """Balance a binary (0/1) target by random under- or over-sampling.

    Parameters
    ----------
    X : pandas.DataFrame
        Data containing the ``target`` column with labels 0 and 1.
    target : str
        Name of the label column.
    sample_type : str, default 'under'
        ``'under'`` draws a random subset of the majority class the size of
        the minority class; any other value over-samples the minority class
        with replacement up to the size of the majority class.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        Balanced frame (majority rows first) with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If either class 0 or class 1 has no rows.
    """
    X_class_0 = X[X[target] == 0]
    X_class_1 = X[X[target] == 1]
    # Count per label explicitly: `value_counts()` is ordered by frequency,
    # not by label, so unpacking it mislabels the counts when 1 is the majority.
    count_class_0, count_class_1 = len(X_class_0), len(X_class_1)
    if count_class_0 == 0 or count_class_1 == 0:
        raise ValueError(
            "binaryRandomSampler needs rows for both class 0 and class 1 in "
            "column {!r}".format(target))

    if count_class_0 < count_class_1:
        X_lower_class, X_higher_class = X_class_0, X_class_1
    else:
        X_lower_class, X_higher_class = X_class_1, X_class_0

    if sample_type == 'under':
        X_higher_class = X_higher_class.sample(len(X_lower_class),
                                               random_state=random_state)
        label = 'under'
    else:
        X_lower_class = X_lower_class.sample(len(X_higher_class), replace=True,
                                             random_state=random_state)
        label = 'over'

    # drop=True discards the old index whatever its name is.
    X = pd.concat([X_higher_class, X_lower_class], axis=0).reset_index(drop=True)

    print('Random {}-sampling:'.format(label))
    print(X[target].value_counts())
    return X


In [None]:
def _build_rf_pipeline():
    """Return a fresh RobustScaler -> RandomForestClassifier pipeline."""
    forest = RandomForestClassifier(n_estimators=200, n_jobs=6, random_state=100)
    return Pipeline(steps=[("scaler", RobustScaler()), ("model", forest)])


# Two independent, identically configured pipelines: one is trained on the
# over-sampled data, the other on the under-sampled data.
rf_over = _build_rf_pipeline()
rf_under = _build_rf_pipeline()

In [None]:
def _build_gb_pipeline():
    """Return a fresh RobustScaler -> GradientBoostingClassifier pipeline."""
    # NOTE(review): warm_start=True is kept exactly as in the original
    # configuration - confirm it is actually wanted here.
    booster = GradientBoostingClassifier(warm_start=True, n_estimators=200,
                                         random_state=100)
    return Pipeline(steps=[("scaler", RobustScaler()), ("model", booster)])


# One pipeline per resampling variant (over-/under-sampled training data).
gb_over = _build_gb_pipeline()
gb_under = _build_gb_pipeline()

In [None]:
def _build_logistic_pipeline():
    """Return a fresh RobustScaler -> LogisticRegression pipeline."""
    classifier = LogisticRegression(max_iter=10000, random_state=100)
    return Pipeline(steps=[("scaler", RobustScaler()), ("model", classifier)])


# One pipeline per resampling variant (over-/under-sampled training data).
logistic_over = _build_logistic_pipeline()
logistic_under = _build_logistic_pipeline()

In [None]:
# Load the scoring set (first CSV column becomes the index) and remove the
# target column so only the feature columns remain.
X_test = (
    pd.read_csv('../datasets/give_me_credit/test.csv', low_memory=False, index_col=0)
      .drop(columns=['SeriousDlqin2yrs'])
)

In [None]:
def get_data(train_path='../datasets/give_me_credit/train.csv',sample_type='under'):
    """Load the training CSV and return a stratified train/validation split.

    Rows containing any missing value are dropped. The data is split first
    and only the training part is re-balanced with ``binaryRandomSampler``.
    Resampling before the split lets duplicated (over-sampled) rows land in
    both parts and leaves the validation set with an artificial class
    balance, which inflates the reported metrics.

    Parameters
    ----------
    train_path : str
        Path of the training CSV; its first column is used as the index.
    sample_type : str or None, default 'under'
        Forwarded to ``binaryRandomSampler`` (``'under'`` / ``'over'``).
        ``None`` disables resampling.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)`` with 80% of the rows in the
        training part before resampling.
    """
    target = 'SeriousDlqin2yrs'
    data = pd.read_csv(train_path, low_memory=False, index_col=0).dropna(axis=0)
    y = data[target]
    X = data.drop(columns=[target])
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y,
                                                          train_size=0.8, random_state=100)
    if sample_type is not None:
        # The sampler works on a single frame holding features and target.
        train = binaryRandomSampler(X_train.assign(**{target: y_train}),
                                    target, sample_type=sample_type)
        y_train = train[target]
        X_train = train.drop(columns=[target])
    return X_train, X_valid, y_train, y_valid

In [None]:
# Build one train/validation split per resampling variant.
(X_train_over, X_valid_over,
 y_train_over, y_valid_over) = get_data(sample_type='over')
(X_train_under, X_valid_under,
 y_train_under, y_valid_under) = get_data(sample_type='under')

In [None]:
# Random forest trained and validated on the under-sampled split.
get_classification_results(
    model=rf_under,
    X_train=X_train_under,
    y_train=y_train_under,
    X_test=X_valid_under,
    y_test=y_valid_under,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Random forest trained and validated on the over-sampled split.
get_classification_results(
    model=rf_over,
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_valid_over,
    y_test=y_valid_over,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Gradient boosting trained and validated on the under-sampled split.
get_classification_results(
    model=gb_under,
    X_train=X_train_under,
    y_train=y_train_under,
    X_test=X_valid_under,
    y_test=y_valid_under,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Gradient boosting trained and validated on the over-sampled split.
get_classification_results(
    model=gb_over,
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_valid_over,
    y_test=y_valid_over,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Stand-alone scaling pipeline, used to pre-scale data for the
# hyper-parameter search below.
preprocessor = Pipeline(steps=[
    ("scaler", RobustScaler()),
])

In [None]:
# Learn the scaling statistics from the training data only ...
X_train = preprocessor.fit_transform(X_train_under, y_train_under)
# ... and reuse them for the validation data. Re-fitting on the validation
# set would scale it with its own statistics, putting train and validation
# features on different scales and leaking validation information.
X_valid = preprocessor.transform(X_valid_under)

In [None]:

# Randomised hyper-parameter search for a random forest on the scaled,
# under-sampled training data.
params = {'n_estimators': [100, 200, 300, 500],
          # NOTE(review): draws integers from 8 up to (presumably) 10, which
          # assumes the data has at least that many feature columns - confirm.
          "max_features": randint(8, 11),
          "min_samples_split": randint(2, 11),
          "min_samples_leaf": randint(1, 11),
          }

kfold = KFold(n_splits=5, shuffle=True, random_state=0)
start = time()
# Seed both the forest and the parameter sampling so the search is
# reproducible across runs. warm_start is dropped: it gives the search
# nothing, and it would turn the later re-fit of `best_estimator_` into a
# no-op that keeps the previously grown trees.
randomSearch_rf = RandomizedSearchCV(RandomForestClassifier(random_state=100),
                                     param_distributions=params, n_iter=20,
                                     cv=kfold, n_jobs=6, random_state=100)
randomSearch_rf.fit(X_train, y_train_under)

print('training took {} minutes'.format((time() - start)/60.))


In [None]:
# Disabled experiment: the gradient-boosting counterpart of the random-forest
# search, wrapped in a bare string literal so that none of it is executed.
# NOTE(review): as a consequence `randomSearch_gb` is never created by this
# cell, although a later evaluation cell refers to it.
'''
params = {'n_estimators':[100,200,300,500],
           "max_features": randint(8,11),
           "min_samples_split": randint(2, 11),
           "min_samples_leaf": randint(1, 11),
         }

kfold = KFold(n_splits=5, shuffle=True, random_state=0)
start = time()
randomSearch_gb = RandomizedSearchCV(GradientBoostingClassifier(warm_start=True,random_state=100),
                                     param_distributions=params,n_iter=20,
                                     cv=kfold,n_jobs=6)        
randomSearch_gb.fit(X_train,y_train_under)

print('training took {} minutes'.format((time() - start)/60.))
'''


In [None]:
# Evaluate the best random forest found by the randomised search on the
# pre-scaled under-sampled split.
get_classification_results(
    model=randomSearch_rf.best_estimator_,
    X_train=X_train,
    y_train=y_train_under,
    X_test=X_valid,
    y_test=y_valid_under,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Evaluate the best gradient-boosting model, if its search has been run.
# The search cell is currently disabled, so `randomSearch_gb` may not exist;
# guard the call so "Restart & Run All" does not stop with a NameError here.
# The labels must be the under-sampled ones that match X_train / X_valid
# (no plain `y_train` / `y_valid` is defined in this notebook).
if 'randomSearch_gb' in globals():
    get_classification_results(randomSearch_gb.best_estimator_,
                               X_train, y_train_under,
                               X_valid, y_valid_under,
                               target_names=["Low Risk", "High Risk"])
else:
    print('randomSearch_gb is not defined - enable the gradient boosting '
          'search cell to evaluate it')

In [None]:
# Share of missing cells in the scoring set, in percent.
get_missing_values_percentage(df=X_test)

In [None]:
# Named base learners for the over-sampled stack.
models_over = [
    ('RF_over', rf_over),
    ('GB_over', gb_over),
    ('logistic_over', logistic_over),
]

# The stacking classifier only needs the estimators, not their labels.
stacked_models_over = tuple(estimator for _label, estimator in models_over)

In [None]:
# Named base learners for the under-sampled stack.
models_under = [
    ('RF_under', rf_under),
    ('GB_under', gb_under),
    ('logistic_under', logistic_under),
]

# The stacking classifier only needs the estimators, not their labels.
stacked_models_under = tuple(estimator for _label, estimator in models_under)

In [None]:
# Stacked ensemble for the over-sampled data. The third entry of
# `models_over` (the logistic-regression pipeline) serves as meta-classifier,
# and the original features are passed to it alongside the base predictions.
stack_gen_over = StackingCVClassifier(
    classifiers=stacked_models_over,
    meta_classifier=models_over[2][1],
    use_features_in_secondary=True,
)

In [None]:
# Stacked ensemble for the under-sampled data. The third entry of
# `models_under` (the logistic-regression pipeline) serves as meta-classifier,
# and the original features are passed to it alongside the base predictions.
stack_gen_under = StackingCVClassifier(
    classifiers=stacked_models_under,
    meta_classifier=models_under[2][1],
    use_features_in_secondary=True,
)

In [None]:
# Train and evaluate the over-sampled stack; the training inputs and the
# validation features are handed over as plain NumPy arrays.
get_classification_results(
    model=stack_gen_over,
    X_train=np.array(X_train_over),
    y_train=np.array(y_train_over),
    X_test=np.array(X_valid_over),
    y_test=y_valid_over,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Train and evaluate the under-sampled stack; the training inputs and the
# validation features are handed over as plain NumPy arrays.
get_classification_results(
    model=stack_gen_under,
    X_train=np.array(X_train_under),
    y_train=np.array(y_train_under),
    X_test=np.array(X_valid_under),
    y_test=y_valid_under,
    target_names=["Low Risk", "High Risk"],
)

In [None]:
# Impute the scoring set, score it with the over-sampled stack and write the
# submission file.
# NOTE(review): the imputer is fitted on the scoring data itself, whereas the
# training rows with missing values were dropped - confirm this is intended.
imp1 = myImputer(strategy='median')
df1 = X_test
imp1 = imp1.fit(df1)
df1 = imp1.transform(df1)
get_missing_values_percentage(df1)

# The submission column is a probability, so export the positive-class
# probability instead of hard 0/1 labels. The stack was fitted on NumPy
# arrays, so predict on an array as well.
preds_test = stack_gen_over.predict_proba(np.array(df1))[:, 1]
pd.Series(preds_test, name='Probability').describe()

output = pd.DataFrame({'Id': X_test.index,
                       'Probability': preds_test})
output.to_csv('submission_credit_4.csv', index=False)