# Bank Project - Resampling

In [4]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
import preprocessing as prep
from sklearn.utils import resample

## Fetch Data

In [6]:
df = pd.read_csv('./data/bank-full.csv', sep=';')

In [7]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [("target",prep.TargetTransformer(),'y')])

y_trans = pd.DataFrame(data = ct.fit_transform(df),columns=['y_trans'] ,index = df.index)

df_t = pd.concat([y_trans,df],axis=1).drop(['y'],axis=1)

## Resampling

In [9]:
# Separate majority and minority classes
df_majority = df_t[df_t.y_trans==0]
df_minority = df_t[df_t.y_trans==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, replace=False ,n_samples=len(df_minority),random_state=101)
 
# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_minority, df_majority_downsampled])

# Upsample minority class
df_minority_upsampled = resample(df_minority,replace=True,n_samples=len(df_majority),random_state=101)
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [26]:
print('Dataset Size: ' + str(len(df_t)))
print('Downsampled Size: ' + str(len(df_downsampled)))
print('Upsampled Size: ' + str(len(df_upsampled)))

Dataset Size: 45211
Downsampled Size: 10578
Upsampled Size: 79844


In [27]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df_upsampled.drop(['y_trans'],axis=1), df_upsampled['y_trans'], test_size = 0.25,random_state=101, stratify = df_upsampled.y_trans)

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rtree = RandomForestClassifier()

In [54]:
cat_features = ['job','marital','education','default','housing','loan','contact','poutcome','month']

In [55]:
ct = ColumnTransformer(
    [
    ("logplus1",prep.LogPlus1Transformer(),['duration','age']),
    ("campaign",prep.CampaignTransformer(),'campaign'),
    ('ohe',OneHotEncoder(sparse=False),cat_features)
    ]
    )

In [59]:
rtree.fit(ct.fit_transform(X_train),y_train)

ValueError: could not convert string to float: '(0.999, 2.0]'