In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.neighbors import DistanceMetric
from sklearn.model_selection import GridSearchCV, train_test_split
import warnings
warnings.filterwarnings("ignore")
from imblearn.under_sampling import NearMiss
from collections import Counter

import random
from numpy.random import seed
import tensorflow as tf
from kmodes.kprototypes import KPrototypes

In [3]:
# function to reset all RNGs to one fixed seed (23 unless overridden)
def reset_random_seeds(seed_value=23):
    """Seed TensorFlow, NumPy and Python's ``random`` module with one value.

    Parameters
    ----------
    seed_value : int, default 23
        Seed handed to each library's global random number generator.
        The default keeps the behaviour of the original zero-argument call.
    """
    tf.random.set_seed(seed_value)  # tensorflow's seed
    np.random.seed(seed_value)  # numpy's seed
    random.seed(seed_value)  # python's seed

# import data (semicolon-separated CSV)
data = pd.read_csv(r'bank-additional-full.csv', sep=';', engine='python')
#data = data.head(1000)
length = data.shape[0]  # number of rows in the full table

# select variables: the feature columns kept for clustering plus the target 'y'
# (a bare `data.head()` used to sit here; mid-cell its result was discarded, so it was removed)
cats_to_use = ['age', 'default', 'contact', 'month', 'previous', 'poutcome', 'emp.var.rate', 'euribor3m', 'nr.employed', 'y']
data = data[cats_to_use]

# 'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
#       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
#       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
#       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'


In [None]:
# preview the first rows of the selected columns
data.head(n=5)

In [None]:

# save lists of categorical and numerical variables
cat_cols = ['default', 'contact', 'month', 'poutcome', 'y']
num_cols = ['age', 'previous', 'emp.var.rate', 'euribor3m', 'nr.employed']

# The features are intentionally left un-encoded and un-scaled at this point:
# the raw mixed-type columns are what gets clustered, and one-hot encoding /
# normalisation is applied afterwards, once the cluster label has been added.

# separate the table into target y and feature matrix X (pandas df -> numpy arrays)
y = data["y"].values
X = data.drop(columns=['y']).values

# reset RNGs
reset_random_seeds()

# save and check dimensions of X (rows, feature columns)
(X_length, X_vars) = X.shape
X_length, X_vars

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible. (A leading `data.head()` was dropped: its result was
# never displayed because it was not the last expression of the cell.)
train_features, test_features, train_targets, test_targets = train_test_split(
    X, y.ravel(), test_size=0.2, random_state=23
)

# for col in data.columns:
#     print(col)
train_features

In [None]:
# wcss = []
# for i in range(1, 20):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
#     kmeans.fit(train_features)
#     wcss.append(kmeans.inertia_)
# plt.plot(range(1, 20), wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

In [None]:
# column positions of the categorical features inside the feature matrix
# (1 = default, 2 = contact, 3 = month, 5 = poutcome)
categorical_features_idx = [
    1,
    2,
    3,
    5,
]

In [None]:
# array that is handed to the k-prototypes model (training features)
mark_array = train_features

In [None]:
# Fit a 2-cluster k-prototypes model on the mixed numeric/categorical training
# features. random_state pins the initialisation so that re-running this cell
# on its own yields the same clusters instead of depending on how much of the
# global NumPy RNG stream has already been consumed.
kproto = KPrototypes(n_clusters=2, verbose=2, max_iter=20, random_state=23).fit(
    mark_array, categorical=categorical_features_idx
)

In [None]:
# re-bind the training features as the array to predict on
mark_array = train_features

In [None]:
# prediction: assign every training row to one of the fitted clusters
clusters = kproto.predict(
    mark_array,
    categorical=categorical_features_idx,
)

In [None]:
# average of the training cluster labels
print(np.mean(clusters))

In [None]:
# start the augmented training table from the raw training features
train = train_features
print(np.shape(train))

In [None]:
# Append the cluster label and the target as the last two columns.
# The table is rebuilt from train_features rather than from `train` itself,
# so re-running this cell cannot keep stacking extra columns, and no
# hard-coded insert positions are needed.
train = np.column_stack([train_features, clusters, train_targets])
train.shape

In [None]:
# wrap the augmented training array in a DataFrame with named columns
df_train = pd.DataFrame(
    train,
    columns=["age", "default", "contact", "month", "previous", "poutcome",
             "emp.var.rate", "euribor3m", "nr.employed", 'cluster', 'y'],
)

#print(df_train)

In [None]:
# save lists of categorical and numerical variables
cat_cols = ['default', 'contact', 'month', 'poutcome', 'cluster', 'y']
num_cols = ['age', 'previous', 'emp.var.rate', 'euribor3m', 'nr.employed']

# create column transformer to (1) one-hot-encode cat vars and (2) normalise num vars
# sparse_threshold=0 forces a dense array: with the default threshold the
# transformer may hand back a scipy sparse matrix, which the DataFrame
# construction below cannot label with column names.
ct = make_column_transformer(
    (OneHotEncoder(drop='first'), cat_cols), # drop first column (reference)
    (StandardScaler(), num_cols),
    sparse_threshold=0,
)

# transform base table (pandas df -> numpy array)
base = ct.fit_transform(df_train)

# convert base table to pd.DataFrame for ease of use (numpy array -> pandas df)
base_temp = pd.DataFrame(base, columns=ct.get_feature_names_out().tolist())
base_temp

In [None]:
# persist the encoded training table (row index omitted)
base_temp.to_csv(
    r'bank-additional-trainclusters.csv',
    index=False,
)

# test clusters

In [None]:
# array to predict clusters on (held-out test features)
mark_array_test = test_features

In [None]:
# prediction: assign every test row to a cluster using the model fitted on the training set
clusters_pred = kproto.predict(
    mark_array_test,
    categorical=categorical_features_idx,
)

In [None]:
# average of the test cluster labels
print(np.mean(clusters_pred))

In [None]:
# start the augmented test table from the raw test features
test = test_features
print(np.shape(test))

In [None]:
# Append the predicted cluster label and the target as the last two columns.
# The table is rebuilt from test_features rather than from `test` itself,
# so re-running this cell cannot keep stacking extra columns, and no
# hard-coded insert positions are needed.
test = np.column_stack([test_features, clusters_pred, test_targets])
test.shape

In [None]:
df_test = pd.DataFrame(test)
df_test.columns = ["age", "default", "contact", "month","previous", "poutcome", "emp.var.rate", "euribor3m", "nr.employed", 'cluster', 'y']

#print(df_test)

# transform base table (pandas df -> numpy array)
base_test = ct.fit_transform(df_test)

# convert base table to p.df for ease of use (numpy array -> pandas df)
base_temp_test = pd.DataFrame(base_test, columns=ct.get_feature_names_out().tolist())

In [None]:
base_temp_test.insert(loc=1, column='onehotencoder__default_yes', value=0.0)
base_temp_test

In [None]:
# persist the encoded test table (row index omitted)
base_temp_test.to_csv(
    r'bank-additional-testclusters.csv',
    index=False,
)