In [None]:
from collections import Counter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Location of the dermatology CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/dermatology-dataset-classification/dermatology_database_1.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Lift the column display limit so wide frames are shown in full,
# then preview the first five rows (the default for head()).
pd.set_option("display.max_columns", None)
df.head()

# Checking dataset for NaN values and replacing them if present

Check which columns contain the value "?" (which stands for NaN in this dataset) and how many times it occurs in each of them

In [None]:
# Marker string that is counted as a missing value below.
nan = "?"
for col in df.columns:
    # Number of cells in this column equal to the marker.
    hits = (df[col] == nan).sum()
    if hits:
        print(col, "\n", "_"*10)
        print(hits)
        print()

In [None]:
# "?" marks a missing age. Build a float Series with real NaNs and assign the
# result back to the frame: calling replace(..., inplace=True) on df['age']
# operates on an intermediate object, which pandas warns about and which does
# not update df when Copy-on-Write is enabled.
age_values = df['age'].replace("?", np.nan).astype(float)

# Fill the missing ages with the mean of the observed ages.
si = SimpleImputer(missing_values=np.nan, strategy='mean')
# SimpleImputer expects 2-D input; flatten the single imputed column back to
# 1-D before storing it in the DataFrame.
df['age'] = si.fit_transform(age_values.to_numpy().reshape(-1, 1)).ravel()

In [None]:
# Print the number of distinct (non-NaN) values of every column.
for col, n_unique in df.nunique().items():
    print(col, " ", n_unique)

In [None]:
def plots(df, x):
    """Draw a two-panel figure describing one grouping column.

    Left panel: bar chart of the mean of ``df['age']`` within each distinct
    value of column ``x``, with the bar heights written on the bars.
    Right panel: pie chart of how often each distinct value of ``x`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``'age'`` column and the column named by ``x``.
    x : str
        Name of the column to group by.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(25, 10))

    # Compute the per-group mean once and reuse it for both axes of the bar chart.
    mean_age = df.groupby(x)['age'].mean()
    sns.barplot(x=mean_age.index, y=mean_age.values, ax=bar_ax, palette='mako')
    for container in bar_ax.containers:
        bar_ax.bar_label(container, color='black', size=20)

    # Draw on the second panel explicitly instead of relying on pyplot's
    # implicit "current axes".
    counts = df[x].value_counts()
    pie_ax.pie(x=counts,
               labels=counts.index,
               autopct='%.0f%%',
               shadow=True,
               colors=sns.color_palette('summer'))

    fig.suptitle("Bar plots and pie charts for {}".format(x))
    plt.show()
    # This function is called once per column; release each figure so they do
    # not pile up in memory.
    plt.close(fig)

# Data Visualisation And Class Imbalance Detection

Bar plots of the mean age within each value of every other variable, shown next to a pie chart of that variable's value distribution

In [None]:
# One figure per column; 'age' itself is skipped because it is the quantity
# being averaged inside plots().
for col in [c for c in df.columns if c != 'age']:
    plots(df, col)

# Checking for outliers

In [None]:
# Distribution of the age column.
sns.histplot(data=df, x="age")

In [None]:
# Box plot of the age column to spot outliers.
sns.boxplot(data=df, x="age")

In [None]:
# Every column except the last one is used as a feature; the last column is
# the target.
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Label Encoder

Although the target is already numerically encoded and would work just fine with the other models, it does not work with XGBoost. Hence every model will be trained on label-encoded targets in order to avoid complications

In [None]:
# Fit the encoder on the target values, then overwrite y with the encoded
# labels. `le` is kept so the encoding can be reversed later.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Training models pipeline

In [None]:
class Models:
    """Prepare a train/test split with SMOTE oversampling and train models on it.

    Two preparation modes are supported, selected by ``status_all``:

    * ``status_all=False``: split first, then oversample only the training
      part with SMOTE. The test part contains original samples only.
    * ``status_all=True``: oversample the whole data set with SMOTE, then
      split. The test part therefore also contains synthetic samples.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Target labels.
    status_all : bool, default False
        Whether SMOTE is applied to the whole data set before splitting.
    random_state : int or None, default 42
        Seed passed to both SMOTE and ``train_test_split`` so that repeated
        runs produce the same resampled data and the same split.
    test_size : float, default 0.2
        Fraction of the data held out for testing.

    Attributes
    ----------
    x, y
        The data passed in, or the resampled data when ``status_all`` is True.
    train : tuple
        ``(x_train, y_train)`` used by :meth:`training`.
    test : tuple
        ``(x_test, y_test)`` used by :meth:`evaluate`.
    """

    def __init__(self, x, y, status_all=False, random_state=42, test_size=0.2):
        self.x = x
        self.y = y
        self.train = None
        self.test = None
        # Seeded so the two SMOTE strategies can be compared reproducibly.
        sm = SMOTE(random_state=random_state)
        if status_all:
            print("Before ", Counter(self.y))
            # Resample everything first: the split below is taken from the
            # oversampled data.
            self.x, self.y = sm.fit_resample(self.x, self.y)
            x_train, x_test, y_train, y_test = train_test_split(
                self.x, self.y, random_state=random_state, test_size=test_size)
            self.train = (x_train, y_train)
            print("After ", Counter(self.y))
        else:
            print("")
            x_train, x_test, y_train, y_test = train_test_split(
                self.x, self.y, random_state=random_state, test_size=test_size)
            print("Before ", Counter(y_train))
            # Only the training part is oversampled; the test part is untouched.
            self.train = sm.fit_resample(x_train, y_train)
            print("After ", Counter(self.train[1]))
        self.test = (x_test, y_test)
        print()

    def training(self, model, name):
        """Fit ``model`` on the training split, print its test accuracy, return it.

        Parameters
        ----------
        model : estimator
            Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
            place.
        name : str
            Label used in the printed accuracy line.

        Returns
        -------
        estimator
            The same ``model`` object, now fitted.
        """
        x_train, y_train = self.train
        model.fit(x_train, y_train)
        self.evaluate(model, name, 2)
        return model

    def evaluate(self, model, name, _round=2):
        """Print and return the accuracy of ``model`` on the test split.

        Parameters
        ----------
        model : estimator
            Fitted object exposing ``predict(X)``.
        name : str
            Label used in the printed line.
        _round : int, default 2
            Number of decimals the percentage is rounded to.

        Returns
        -------
        float
            Accuracy on the test split as a percentage, rounded to ``_round``
            decimals.
        """
        x_test, y_true = self.test
        y_pred = model.predict(x_test)
        # accuracy_score takes (y_true, y_pred) in that order.
        acc = round(accuracy_score(y_true, y_pred) * 100, _round)
        print("{}: {}%".format(name, acc))
        return acc

# Selecting models and their hyperparameters

In [None]:
# Estimators that draw random numbers get a fixed seed so that repeated runs
# of the notebook give comparable accuracies.
rfc = RandomForestClassifier(n_estimators=110, max_depth=20, criterion='entropy', random_state=42)
abc = AdaBoostClassifier(n_estimators=60, learning_rate=0.1, random_state=42)
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=42)
# `multi_class` is deprecated in recent scikit-learn releases; with the default
# lbfgs solver a multiclass target is already fitted with the multinomial
# loss, so the argument is simply dropped.
lgr = LogisticRegression()
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=10)
xgb = XGBClassifier(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7,
                    colsample_bytree=0.8, random_state=42)

# Parallel lists: names[k] is the display label for models[k].
models = [rfc, abc, dtc, lgr, gnb, knn, xgb]
names = ['Random Forest', 'Ada Boost', 'Decision Trees',
        'Logistic Regression', 'Naive Bayes', 'KNN',
        'XGBoost']

# SMOTE whole sample vs training sample only

I built the whole pipeline for the sole purpose of comparing the models' performance on differently resampled data.

One version will have only its **training** sample resampled using the SMOTE technique

Another version will have the **whole** data sample resampled using SMOTE, which means the testing data is resampled using SMOTE as well

And I will show that there are no drastic changes in accuracies, no matter how we apply SMOTE

In [None]:
# Experiment 1: SMOTE is applied to the training split only (status_all=False).
ml = Models(x, y, False)
trained = []
for estimator, label in zip(models, names):
    # training() fits the estimator in place, prints its test accuracy and
    # returns the same object.
    trained.append(ml.training(estimator, label))
    print()

In [None]:
# Experiment 2: SMOTE is applied to the whole data set before splitting
# (status_all=True).
trained_all = []
ml = Models(x, y, True)
for estimator, label in zip(models, names):
    # `models` holds the very objects that were fitted in the previous
    # experiment and stored in `trained`. Fit an unfitted clone with the same
    # hyperparameters so those earlier models are not overwritten, and collect
    # the results in `trained_all` (not `trained`).
    trained_all.append(ml.training(clone(estimator), label))
    print()