In [1]:
%pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

#Small Dataset
##Data Preparation

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Bank_Personal_Loan_Modelling.xlsx to Bank_Personal_Loan_Modelling.xlsx


In [None]:
df_s = pd.read_excel('Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')
df_s.head()
df_s.columns = df_s.columns.str.replace(' ', '').str.lower() #standardise column names
df_s.ccavg = df_s.ccavg * 12 # income is annual while credit card spening is in month
df_s.isna().sum() #no NA in all rows
df_s.duplicated().sum() #no duplicated rows
df_s.dtypes #all seems fine
df_s.describe() #unbalanced dataset for all bank products, use iterative stratification and min for ZIP is less than 5 digits
df_s.corr() #age and experience is similar
df_s.drop(columns=['experience','id'], inplace=True, axis=1) #remove experience for similarity and id
df_s = df_s[df_s.zipcode.astype(str).str.len()==5]
df_s.head()

Unnamed: 0,age,income,zipcode,family,ccavg,education,mortgage,personalloan,securitiesaccount,cdaccount,online,creditcard
0,25,49,91107,4,19.2,1,0,0,1,0,0,0
1,45,34,90089,3,18.0,1,0,0,1,0,0,0
2,39,11,94720,1,12.0,1,0,0,0,0,0,0
3,35,100,94112,1,32.4,2,0,0,0,0,0,0
4,35,45,91330,4,12.0,2,0,0,0,0,0,1


In [None]:
X_s = df_s.drop(columns=['personalloan','securitiesaccount','cdaccount','creditcard'])
y_s = df_s[['personalloan','securitiesaccount','cdaccount','creditcard']]
#X_train_s, y_train_s, X_test_s, y_test_s = iterative_train_test_split(X_s.to_numpy(), y_s.to_numpy(), test_size=0.2)
mlss = MultilabelStratifiedKFold(n_splits=5)
for train_i, test_i in mlss.split(X_s, y_s):
    X_train_s, X_test_s = X_s.iloc[train_i], X_s.iloc[test_i]
    y_train_s, y_test_s = y_s.iloc[train_i], y_s.iloc[test_i]
    break

#Large Dataset
##Data Preparation

In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
bank_marketing = fetch_ucirepo(id=222)

# data (as pandas dataframes)
X = bank_marketing.data.features
y = bank_marketing.data.targets

# metadata
print(bank_marketing.metadata)

# variable information
print(bank_marketing.variables)

df_l = pd.concat([X,y], axis=1)

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [None]:
onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
job_encoded = pd.DataFrame(onehotencoder.fit_transform(df_l[['job']]), columns = onehotencoder.get_feature_names_out(['job'])).astype(int)
df_l = pd.concat([df_l.drop(columns='job', axis=1),job_encoded], axis = 1)

martial_encoded = pd.DataFrame(onehotencoder.fit_transform(df_l[['marital']]), columns = onehotencoder.get_feature_names_out(['marital'])).astype(int)
df_l = pd.concat([df_l.drop(columns='marital', axis=1),martial_encoded], axis=1)

education_encoder=OrdinalEncoder(categories=[['primary','secondary','tertiary']], handle_unknown='use_encoded_value', unknown_value=-1)
df_l['education'] = education_encoder.fit_transform(df_l[['education']]).astype(int)
df_l[['default','housing','loan']] = df_l[['default','housing','loan']].replace({'yes':1, 'no':0})

contact_encoded = pd.DataFrame(onehotencoder.fit_transform(df_l[['contact']]), columns = onehotencoder.get_feature_names_out(['contact'])).astype(int)
df_l = pd.concat([df_l.drop(columns='contact', axis=1),contact_encoded], axis=1)

start_year = 2008
df_l['year'] = start_year
df_l['month'] = pd.to_datetime(df_l['month'], format='%b').dt.month
for i in range(1, len(df_l)):
    if df_l.loc[i, 'month'] < df_l.loc[i - 1, 'month']:
        df_l.loc[i, 'year'] = df_l.loc[i - 1, 'year'] + 1
    else:
        df_l.loc[i, 'year'] = df_l.loc[i - 1, 'year']
contact_date = pd.to_datetime(df_l['day_of_week'].astype(str) + '-' + df_l['month'].astype(str) + '-' + df_l['year'].astype(str), format='%d-%m-%Y')
last_contact_date = contact_date.max()
df_l['days_from_contact'] = (last_contact_date - contact_date).dt.days

df_l['y'] = df_l['y'].replace({'yes':1, 'no':0})

  df_l[['default','housing','loan']] = df_l[['default','housing','loan']].replace({'yes':1, 'no':0})
  df_l['y'] = df_l['y'].replace({'yes':1, 'no':0})


In [None]:
df_l.isna().sum() #remove poutcome with 36959 nan
df_l.drop(columns=['poutcome', 'job_nan', 'contact_nan'], inplace=True, axis=1)
df_l.duplicated().sum() #no duplicated rows
df_l.dtypes #all seems fine
df_l.rename(columns={'job_admin.': 'job_admin', 'y': 'term_deposit', 'housing': 'housing_loan', 'loan': 'personal_loan'}, inplace=True)
df_l.describe() #unbalanced dataset for loan and y, use iterative stratification
df_l.head()

Unnamed: 0,age,education,default,balance,housing_loan,personal_loan,day_of_week,month,duration,campaign,...,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,year,days_from_contact
0,58,2,0,2143,1,0,5,5,261,1,...,0,0,0,0,1,0,0,0,2008,926
1,44,1,0,29,1,0,5,5,151,1,...,0,1,0,0,0,1,0,0,2008,926
2,33,1,0,2,1,1,5,5,76,1,...,0,0,0,0,1,0,0,0,2008,926
3,47,-1,0,1506,1,0,5,5,92,1,...,0,0,0,0,1,0,0,0,2008,926
4,33,-1,0,1,0,0,5,5,198,1,...,0,0,0,0,0,1,0,0,2008,926


In [None]:
X_l = df_l.drop(columns=['housing_loan','personal_loan','term_deposit'])
y_l = df_l[['housing_loan','personal_loan','term_deposit']]
mlss = MultilabelStratifiedKFold(n_splits=5)
for train_i, test_i in mlss.split(X_l, y_l):
    X_train_l, X_test_l = X_l.iloc[train_i], X_l.iloc[test_i]
    y_train_l, y_test_l = y_l.iloc[train_i], y_l.iloc[test_i]
    break