# Bank Project - Resampling

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
import preprocessing as prep
from sklearn.utils import resample

## Fetch Data

In [2]:
df = pd.read_csv('./data/bank-full.csv', sep=';')

In [3]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [("target",prep.TargetTransformer(),'y')])

y_trans = pd.DataFrame(data = ct.fit_transform(df),columns=['y_trans'] ,index = df.index)

In [4]:
df_t = pd.concat([y_trans,df],axis=1).drop(['y'],axis=1)

In [32]:
# Upsample minority classPython

# Separate majority and minority classes
df_majority = df_t[df_t.y_trans==0]
df_minority = df_t[df_t.y_trans==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=101) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [19]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=len(df_minority),    # to match majority class
                                 random_state=101) # reproducible results
 
# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_minority, df_majority_downsampled])

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
cat_features = ['job','marital','education','default','housing','loan','contact','poutcome','month']

ct = ColumnTransformer(
    [
    ("logplus1",prep.LogPlus1Transformer(),['duration','age']),
    ("campaign",prep.CampaignTransformer(),'campaign'),
    ('ohe',OneHotEncoder(sparse=False),cat_features)
    ]
    )



In [75]:
X_train, X_test, y_train, y_test = train_test_split(df_upsampled.drop(['y_trans'],axis=1), df_upsampled.y_trans, stratify = df_upsampled.y_trans)

In [76]:
from sklearn.ensemble import RandomForestClassifier
rtree = RandomForestClassifier()
rtree.fit(ct.fit_transform(X_train),y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [64]:
from sklearn.metrics import precision_recall_fscore_support

In [65]:
from sklearn.model_selection import GridSearchCV

In [77]:
parameters = {'n_estimators':[100,500,1000], 'max_depth':[5,10,20,50,75,100], 'min_samples_leaf':[1,5,10,20]}
rtree = RandomForestClassifier(class_weight={0:1,1:2})
clf = GridSearchCV(rtree, parameters)
clf.fit(ct.fit_transform(X_train),y_train)

KeyboardInterrupt: 

In [None]:
precision_recall_fscore_support(y_test, clf.predict(ct.fit_transform(X_test)))

In [None]:
precision_recall_fscore_support?