# Library

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder



# EDA

In [2]:
df = pd.read_csv("/kaggle/input/dermatology-dataset-classification/dermatology_database_1.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   erythema                             366 non-null    int64 
 1   scaling                              366 non-null    int64 
 2   definite_borders                     366 non-null    int64 
 3   itching                              366 non-null    int64 
 4   koebner_phenomenon                   366 non-null    int64 
 5   polygonal_papules                    366 non-null    int64 
 6   follicular_papules                   366 non-null    int64 
 7   oral_mucosal_involvement             366 non-null    int64 
 8   knee_and_elbow_involvement           366 non-null    int64 
 9   scalp_involvement                    366 non-null    int64 
 10  family_history                       366 non-null    int64 
 11  melanin_incontinence                 366 non-

In [4]:
description = df.describe().transpose()

# Display the complete description
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(description)

                                     count      mean       std  min  25%  50%  \
erythema                             366.0  2.068306  0.664753  0.0  2.0  2.0   
scaling                              366.0  1.795082  0.701527  0.0  1.0  2.0   
definite_borders                     366.0  1.549180  0.907525  0.0  1.0  2.0   
itching                              366.0  1.366120  1.138299  0.0  0.0  1.0   
koebner_phenomenon                   366.0  0.633880  0.908016  0.0  0.0  0.0   
polygonal_papules                    366.0  0.448087  0.957327  0.0  0.0  0.0   
follicular_papules                   366.0  0.166667  0.570588  0.0  0.0  0.0   
oral_mucosal_involvement             366.0  0.377049  0.834147  0.0  0.0  0.0   
knee_and_elbow_involvement           366.0  0.614754  0.982979  0.0  0.0  0.0   
scalp_involvement                    366.0  0.519126  0.905639  0.0  0.0  0.0   
family_history                       366.0  0.125683  0.331946  0.0  0.0  0.0   
melanin_incontinence        

In [5]:
df.head()

Unnamed: 0,erythema,scaling,definite_borders,itching,koebner_phenomenon,polygonal_papules,follicular_papules,oral_mucosal_involvement,knee_and_elbow_involvement,scalp_involvement,...,disappearance_granular_layer,vacuolisation_damage_basal_layer,spongiosis,saw_tooth_appearance_retes,follicular_horn_plug,perifollicular_parakeratosis,inflammatory_mononuclear_infiltrate,band_like_infiltrate,age,class
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3


In [6]:
df['class'].value_counts()

1    112
3     72
2     61
5     52
4     49
6     20
Name: class, dtype: int64

# Preprocess

In [7]:
mask = df.isin(['?'])

# Count "?" values per column
column_counts = mask.sum()

column_counts

erythema                               0
scaling                                0
definite_borders                       0
itching                                0
koebner_phenomenon                     0
polygonal_papules                      0
follicular_papules                     0
oral_mucosal_involvement               0
knee_and_elbow_involvement             0
scalp_involvement                      0
family_history                         0
melanin_incontinence                   0
eosinophils_infiltrate                 0
PNL_infiltrate                         0
fibrosis_papillary_dermis              0
exocytosis                             0
acanthosis                             0
hyperkeratosis                         0
parakeratosis                          0
clubbing_rete_ridges                   0
elongation_rete_ridges                 0
thinning_suprapapillary_epidermis      0
spongiform_pustule                     0
munro_microabcess                      0
focal_hypergranu

In [8]:
# Drop rows with "?" values
df = df[~mask.any(axis=1)]

# Feature Extraction

In [9]:
# Age Binning
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['age_bin'] = pd.cut(df['age'], bins=[0, 12, 18, 65, 120],
                         labels=['Child', 'Teenager', 'Adult', 'Elderly'])

# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Fit and transform the 'agebin' data using the encoder
df[['age_bin']] = encoder.fit_transform(df[['age_bin']])

In [10]:
# Feature Scaling
scaler = MinMaxScaler()
numerical_cols = ['erythema', 'scaling', 'exocytosis', 'acanthosis', 'hyperkeratosis']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [11]:
# Feature Aggregation
aggregated_features = df[['erythema', 'scaling', 'exocytosis']].mean(axis=1)
df['aggregated_features'] = aggregated_features

In [12]:
# Domain-Specific Transformations
df['log_erythema'] = np.log1p(df['erythema'])
df['scaling_squared'] = df['scaling'] ** 2

In [13]:
# Display the modified dataset
df.head()

Unnamed: 0,erythema,scaling,definite_borders,itching,koebner_phenomenon,polygonal_papules,follicular_papules,oral_mucosal_involvement,knee_and_elbow_involvement,scalp_involvement,...,follicular_horn_plug,perifollicular_parakeratosis,inflammatory_mononuclear_infiltrate,band_like_infiltrate,age,class,age_bin,aggregated_features,log_erythema,scaling_squared
0,0.666667,0.666667,0,3,0,0,0,0,1,0,...,0,0,1,0,55,2,0.0,0.777778,0.510826,0.444444
1,1.0,1.0,3,2,1,0,0,0,1,1,...,0,0,1,0,8,1,1.0,0.777778,0.693147,1.0
2,0.666667,0.333333,2,3,1,3,0,3,0,0,...,0,0,2,3,26,3,0.0,0.444444,0.510826,0.111111
3,0.666667,0.666667,2,0,0,0,0,0,3,2,...,0,0,3,0,40,1,0.0,0.444444,0.510826,0.444444
4,0.666667,1.0,2,2,2,2,0,2,0,0,...,0,0,2,3,45,3,0.0,0.666667,0.510826,1.0


In [14]:
mask = df.isnull()

# Count null values per column
column_counts = mask.sum()

column_counts

erythema                               0
scaling                                0
definite_borders                       0
itching                                0
koebner_phenomenon                     0
polygonal_papules                      0
follicular_papules                     0
oral_mucosal_involvement               0
knee_and_elbow_involvement             0
scalp_involvement                      0
family_history                         0
melanin_incontinence                   0
eosinophils_infiltrate                 0
PNL_infiltrate                         0
fibrosis_papillary_dermis              0
exocytosis                             0
acanthosis                             0
hyperkeratosis                         0
parakeratosis                          0
clubbing_rete_ridges                   0
elongation_rete_ridges                 0
thinning_suprapapillary_epidermis      0
spongiform_pustule                     0
munro_microabcess                      0
focal_hypergranu

In [15]:
df = df.fillna(df.mean())

# Feature Selection

In [16]:
X = df.drop(columns="class")
y = df['class']

In [17]:
# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int

In [18]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores

elongation_rete_ridges                 0.599445
clubbing_rete_ridges                   0.570930
thinning_suprapapillary_epidermis      0.548732
age                                    0.511310
band_like_infiltrate                   0.488438
saw_tooth_appearance_retes             0.477504
vacuolisation_damage_basal_layer       0.476801
melanin_incontinence                   0.464719
spongiosis                             0.463565
focal_hypergranulosis                  0.462631
polygonal_papules                      0.451427
exocytosis                             0.429138
oral_mucosal_involvement               0.427418
knee_and_elbow_involvement             0.413457
fibrosis_papillary_dermis              0.383914
scalp_involvement                      0.350419
munro_microabcess                      0.316549
PNL_infiltrate                         0.307606
koebner_phenomenon                     0.276245
aggregated_features                    0.267482
disappearance_granular_layer           0

In [19]:
top_5_features = mi_scores.head(5).index.tolist()
top_10_features = mi_scores.head(10).index.tolist()
top_25_features = mi_scores.head(25).index.tolist()

In [20]:
X_5_features = df[top_5_features]
X_10_features = df[top_5_features]
X_25_features = df[top_5_features]
y = df['class']

# Modelling with 5 features

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_5_features, y, test_size=0.3, random_state=42)

In [22]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [23]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [24]:
predict = cross_val_predict(estimator = model, X = X_train, y = y_train, cv = 5)
print("Classification Report: \n",classification_report(y_train, predict))

Classification Report: 
               precision    recall  f1-score   support

           1       0.99      1.00      0.99        80
           2       0.44      0.51      0.48        39
           3       0.98      0.98      0.98        51
           4       0.60      0.53      0.56        34
           5       0.83      0.77      0.80        31
           6       0.79      0.73      0.76        15

    accuracy                           0.81       250
   macro avg       0.77      0.76      0.76       250
weighted avg       0.82      0.81      0.81       250



In [25]:
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       0.38      0.24      0.29        21
           3       1.00      0.95      0.97        20
           4       0.35      0.64      0.45        14
           5       0.93      0.82      0.87        17
           6       1.00      0.80      0.89         5

    accuracy                           0.76       108
   macro avg       0.78      0.74      0.75       108
weighted avg       0.79      0.76      0.76       108



# Modelling with 10 features

In [26]:
x_train,x_test, y_train, y_test = train_test_split(X_10_features, y, test_size=0.3, random_state=42)

In [27]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [28]:
predict = cross_val_predict(estimator = model, X = X_train, y = y_train, cv = 5)
print("Classification Report: \n",classification_report(y_train, predict))

Classification Report: 
               precision    recall  f1-score   support

           1       0.99      1.00      0.99        80
           2       0.44      0.49      0.46        39
           3       0.98      0.98      0.98        51
           4       0.61      0.59      0.60        34
           5       0.82      0.74      0.78        31
           6       0.79      0.73      0.76        15

    accuracy                           0.81       250
   macro avg       0.77      0.76      0.76       250
weighted avg       0.82      0.81      0.81       250



In [29]:
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       0.33      0.24      0.28        21
           3       1.00      0.95      0.97        20
           4       0.35      0.64      0.45        14
           5       0.93      0.76      0.84        17
           6       0.67      0.40      0.50         5

    accuracy                           0.73       108
   macro avg       0.71      0.67      0.67       108
weighted avg       0.76      0.73      0.73       108



# Modelling with 25 Features

In [30]:
x_train,x_test, y_train, y_test = train_test_split(X_25_features, y, test_size=0.3, random_state=42)

In [31]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [32]:
predict = cross_val_predict(estimator = model, X = X_train, y = y_train, cv = 5)
print("Classification Report: \n",classification_report(y_train, predict))

Classification Report: 
               precision    recall  f1-score   support

           1       0.99      1.00      0.99        80
           2       0.47      0.54      0.50        39
           3       0.98      0.98      0.98        51
           4       0.65      0.59      0.62        34
           5       0.83      0.77      0.80        31
           6       0.77      0.67      0.71        15

    accuracy                           0.82       250
   macro avg       0.78      0.76      0.77       250
weighted avg       0.83      0.82      0.82       250



In [33]:
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       0.33      0.24      0.28        21
           3       1.00      0.95      0.97        20
           4       0.35      0.64      0.45        14
           5       0.93      0.76      0.84        17
           6       0.67      0.40      0.50         5

    accuracy                           0.73       108
   macro avg       0.71      0.67      0.67       108
weighted avg       0.76      0.73      0.73       108



# Modelling with all Features

In [34]:
x_train,x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [36]:
predict = cross_val_predict(estimator = model, X = X_train, y = y_train, cv = 5)
print("Classification Report: \n",classification_report(y_train, predict))

Classification Report: 
               precision    recall  f1-score   support

           1       0.99      0.99      0.99        80
           2       0.44      0.51      0.48        39
           3       0.98      0.98      0.98        51
           4       0.61      0.56      0.58        34
           5       0.83      0.77      0.80        31
           6       0.79      0.73      0.76        15

    accuracy                           0.81       250
   macro avg       0.77      0.76      0.76       250
weighted avg       0.82      0.81      0.81       250



In [37]:
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       1.00      0.95      0.98        21
           3       1.00      1.00      1.00        20
           4       0.93      1.00      0.97        14
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00         5

    accuracy                           0.99       108
   macro avg       0.99      0.99      0.99       108
weighted avg       0.99      0.99      0.99       108

