### Preparing the Training Dataset

In [20]:
import pandas as pd 

df = pd.read_csv("Training.csv")
df = df.drop(["Unnamed: 133","fluid_overload"], axis = 1)

print("Shape of dataset {}".format(df.shape))

X_train = df.drop("prognosis",axis = 1)
X_train = X_train.values

y_train = df["prognosis"]
y_train = y_train.values

df.head()

Shape of dataset (4920, 132)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


### Preparing the Test Dataset

In [21]:
df = pd.read_csv("Testing.csv")
df = df.drop(["fluid_overload"], axis = 1)

print("Shape of dataset {}".format(df.shape))

X_test = df.drop("prognosis",axis = 1)
X_test = X_test.values

y_test = df["prognosis"]
y_test = y_test.values

df.head()

Shape of dataset (42, 132)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


### No data split id needed because we have a training dataset file and a test dataset file

In [8]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Model Selection

In [22]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import numpy as np

model_params = {
    "svm" : {
        "model" : SVC(),
        "params" : {
            "C"         : list(np.linspace(1,100,20)),
            "kernel"    : ["linear", "poly", "rbf", "sigmoid"],
            "gamma"     : ["scale","auto"]
        }
    },
    "random_forest":{
        "model": RandomForestClassifier(),
        "params" : {
            "n_estimators"      : [i*10+1 for i in range(20)],
            "criterion"         : ["gini", "entropy"],
            "min_samples_leaf"  : [i+5 for i in range(20)]
        }
    },
    "logistic_regression" : {
        "model"     : LogisticRegression(solver="liblinear", multi_class="auto"),
        "params"    : {
            "C"     : list(np.linspace(1,100,20))
        }
    },
    "knc" : {
        "model": KNeighborsClassifier(),
        "params" : {
            "n_neighbors" : [i+5 for i in range(5)],
            "weights" : ["distance", "uniform"],
            "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"]
        }
    }
}

In [23]:
scores = []

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import numpy as np

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp["model"], mp["params"], cv=5, return_train_score=False, scoring='accuracy')
    clf.fit(X_train,y_train)
    
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    mts = clf.cv_results_
    mts = mts.get("mean_test_score")
    mts = np.mean(mts)  

    scores.append({
        'model': model_name,
        "best_mean_accuracy_in_training" :mts,
        "best_accuracy_in_testing" :acc,
        "best_params": clf.best_params_
    })

### Conclusion:
- Support Vector Classifier, Logistic Regression and K-nearest neighbor models got an accuracy of a 1.0
- We could choose the simplest model : Logistic Regression Model
- Have in mind that the Test Set represent less than 1% of the training dataset
- The distribution of the test dataset was not compared to the training dataset

In [24]:
df = pd.DataFrame(scores, columns = ["model","best_mean_accuracy_in_training", "best_accuracy_in_testing", "best_params"])
df = df.sort_values(by = ["best_accuracy_in_testing", "best_mean_accuracy_in_training"], ascending = False)
df.head()

Unnamed: 0,model,best_mean_accuracy_in_training,best_accuracy_in_testing,best_params
0,svm,1.0,1.0,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}"
2,logistic_regression,1.0,1.0,{'C': 1.0}
3,knc,1.0,1.0,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh..."
1,random_forest,0.989954,0.97619,"{'criterion': 'gini', 'min_samples_leaf': 5, '..."
