In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

import urllib.request as request

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import OneSidedSelection

In [None]:
# Raw data file of the UCI "Contraceptive Method Choice" dataset.
# Dataset page: https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases'
    '/cmc/cmc.data'
)

In [None]:
# Downloads are kept in a "data" folder under the current working directory.
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'data')

# exist_ok=True makes re-running this cell harmless when the folder is already there.
os.makedirs(name=data_dir, exist_ok=True)

In [None]:
# Local copy of the dataset. The download is skipped when the file is already
# on disk, so re-running the notebook neither needs network access nor
# re-fetches the same bytes every time.
save_fname = os.path.join(data_dir, 'cmc.data')
if not os.path.isfile(save_fname):
    request.urlretrieve(url, save_fname)

In [None]:
# The file is read with header=None, so the column names are supplied here;
# 'Target' (the last column) is the class label used by the later cells.
cols = ['Wife_Age', 'Wife_Education', 'Husband_Education', 'Children',
        'Religion', 'Working', 'Occupation', 'LivingIndex',
        'Media_Exposure', 'Target']
df = pd.read_csv(save_fname, names=cols, header=None)

In [None]:
# Keep only rows whose Target is below 3, leaving the two labels (1 and 2)
# that the later cells treat as the majority and minority class.
df = df.loc[df['Target'] < 3]

In [None]:
# Inspect the frame after the Target filter.
df

In [None]:
# Column roles used by the preprocessing cells:
#   cols_num   - numeric features, used as-is
#   cols_cat   - categorical features, one-hot encoded later
#   col_target - class label (kept as a one-element list so df[col_target] is a DataFrame)
cols_num = ['Wife_Age', 'Children']
cols_cat = [
    'Wife_Education', 'Husband_Education',
    'Working', 'Religion', 'Occupation',
    'LivingIndex', 'Media_Exposure',
]
col_target = ['Target']

# Echo the grouping so it is visible in the notebook output
# (label spelling fixed: "Numercial" -> "Numerical").
print(f"X_Features(Categorical): \n {cols_cat} \n")
print(f"X_Features(Numerical): \n {cols_num} \n")
print(f"Y_Features(Categorical): \n {col_target}")

In [None]:
# One-hot encode every categorical column in a single call instead of a
# concat/drop round-trip per column. With `columns=` pandas keeps the
# untouched columns first and appends the dummies in `cols_cat` order, each
# named "<column>_<level>"; drop_first=True omits the first level of each
# column, exactly as the per-column loop did.
df = pd.get_dummies(df, columns=cols_cat, drop_first=True)

In [None]:
# Feature matrix for the embedding: everything except the label column(s).
tmp = df.drop(columns=col_target)

In [None]:
# Project the feature matrix to 2-D with t-SNE (seeded for a repeatable layout).
# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and newer
# releases no longer accept the old name, so use whichever keyword the
# installed version exposes instead of hard-coding one of them.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
X_tsne = TSNE(
    n_components=2,
    verbose=1,
    random_state=2022,
    n_jobs=2,
    **{tsne_iter_kwarg: 2000}
).fit_transform(tmp)

In [None]:
# Wrap the embedding in a DataFrame that carries df's row labels, so the
# embedded points can be matched to their targets by index later on.
X_tsne = pd.DataFrame(
    data=X_tsne,
    columns=['t-SNE_1', 't-SNE_2'],
    index=df.index.tolist(),
)

In [None]:
# Hold out 20% of the embedded points for evaluation.
# random_state makes the split (and every metric derived from it) repeatable
# across runs; stratify keeps the class ratio of the imbalanced target the
# same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_tsne,
    df[col_target],
    train_size=0.80,
    random_state=2022,
    stratify=df[col_target],
)

In [None]:
# Baseline: a random forest trained on the 2-D embedding.
# The forest is seeded so the confusion matrix and accuracy are repeatable.
model = RandomForestClassifier(random_state=2022)

# y_train is a single-column DataFrame; flatten it to 1-D, which is the shape
# scikit-learn expects for labels (a column vector triggers a
# DataConversionWarning).
model.fit(X_train, np.ravel(y_train))

# Evaluate on the held-out split.
y_pred_test = model.predict(X_test)
cm_baseline = confusion_matrix(y_test, y_pred_test)

In [None]:
# Confusion matrix of the baseline model on the held-out test split.
cm_baseline

In [None]:
# Share of held-out samples the baseline model labels correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

In [None]:
# Regular grid over the training embedding, padded by one unit on every side
# and sampled every 0.1; it is used to draw the classifier's decision regions.
dim1 = X_train.values[:, 0]
dim2 = X_train.values[:, 1]

x_min, x_max = dim1.min() - 1, dim1.max() + 1
y_min, y_max = dim2.min() - 1, dim2.max() + 1

grid_x = np.arange(x_min, x_max, 0.1)
grid_y = np.arange(y_min, y_max, 0.1)
xx, yy = np.meshgrid(grid_x, grid_y)

In [None]:
# Predict a class for every grid node, then fold the flat predictions back
# into the grid's shape for contour plotting.
# The model was fitted on a DataFrame, so the grid points are given the same
# column names; passing a bare ndarray makes scikit-learn warn that the input
# "does not have valid feature names".
grid_points = pd.DataFrame(
    np.c_[xx.ravel(), yy.ravel()],
    columns=X_train.columns,
)
z = model.predict(grid_points).reshape(xx.shape)

In [None]:
# Split the training set by label: Target 1 is plotted as the majority class,
# Target 2 as the minority class.
is_major = y_train['Target'] == 1
is_minor = y_train['Target'] == 2

y_major = y_train[is_major]
y_minor = y_train[is_minor]

# Pick the matching embedded points through the shared row index.
X_major = X_train.loc[y_major.index]
X_minor = X_train.loc[y_minor.index]

In [None]:
# Training points drawn over the classifier's decision regions in t-SNE space.
plt.figure(figsize=(10, 10))

# Levels are pinned around the two labels (1 and 2) so each predicted class
# maps to exactly one fill colour, rather than depending on matplotlib's
# automatic level selection cycling through the two-colour list.
plt.contourf(xx, yy, z, levels=[0.5, 1.5, 2.5], alpha=0.1, colors=['pink', 'blue'])

plt.scatter(X_major.values[:, 0],
            X_major.values[:, 1],
            c='hotpink', s=20, edgecolors='black', label='Majority class')
plt.scatter(X_minor.values[:, 0],
            X_minor.values[:, 1],
            c='blue', s=20, edgecolors='black', label='Minority class')

plt.xlabel("t-SNE dim1")
# This was a second xlabel() call, which overwrote the x label with "dim2"
# and left the y axis unlabeled.
plt.ylabel("t-SNE dim2")
plt.legend()
plt.show()