## Import Dependencies

In [1]:
# Import Libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection   import train_test_split
from sklearn.ensemble          import RandomForestClassifier
from xgboost                   import XGBClassifier
from sklearn.metrics           import accuracy_score, f1_score,classification_report, confusion_matrix, precision_score, recall_score
from sklearn.feature_selection import SelectFromModel
from sklearn.tree              import DecisionTreeClassifier
from sklearn.svm               import SVC
from sklearn.ensemble          import ExtraTreesClassifier

## Read Data

In [2]:
# Load the training and testing splits from their CSV files
train = pd.read_csv("Training.csv")
test = pd.read_csv("Testing.csv")

# "Unnamed: 133" is a null, useless column in the training file, so discard it
train = train.drop(columns='Unnamed: 133')

## EDA (Exploratory Data Analysis)

In [3]:
# Dimensions of the training frame, followed by a preview of its first rows
print(train.shape)
train.head()

(4920, 133)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


The training set has **4920 rows** and **133 columns** (132 symptom features plus the `prognosis` target).

In [4]:
# Column count, dtype breakdown and memory footprint of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


All columns hold integer values (binary 1/0) except one `object` column, `prognosis`, which is the target feature.

## Check Missing Values

In [5]:
# Number of missing values in each column
train.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

The output above is truncated and does not show every column, so we sum the per-column counts to get a single total for the whole dataset:

In [6]:
# Total number of missing cells across the whole training frame
train.isnull().sum().sum()

0

There are no null values anywhere in the dataset.

## Check The Balance of the Target Feature

In [7]:
# Number of distinct classes in the target column
train['prognosis'].nunique()

41

The target feature `prognosis` has 41 unique values.

In [8]:
# Observations per class, used to check whether the target is balanced
train['prognosis'].value_counts()

Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
Allergy      

Check the distribution of the target feature `prognosis`. <br>
We can see that all the diseases have the same number of observations, i.e. the classes are **evenly (uniformly) distributed**. <br>
This is called **Balanced Data**.

## Feature with 1 unique value
We will delete those features: a column whose values are all the same carries no information that could help tell the diseases apart.

In [9]:
# Every column name in the training frame
cols = train.columns.to_list()

# Columns holding exactly one distinct value; these are the ones to delete
del_cols = [name for name in cols if train[name].nunique() == 1]

# Show the columns that were found
del_cols

['fluid_overload']

Just one column, the `fluid_overload` column. <br> We will drop it.

In [10]:
# Remove the single-valued columns from both the training and testing frames
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

In [11]:
# Summary statistics of the numeric columns that remain after the drop
train.describe()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,...,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0
mean,0.137805,0.159756,0.021951,0.045122,0.021951,0.162195,0.139024,0.045122,0.045122,0.021951,...,0.021951,0.021951,0.021951,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171
std,0.34473,0.366417,0.146539,0.207593,0.146539,0.368667,0.346007,0.207593,0.207593,0.146539,...,0.146539,0.146539,0.146539,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Feature Selection 
### Based On Feature Importance For Tree Models

In [12]:
def feature_selection(model, threshold=1.0, train_df=None, test_df=None):
    """Keep only the features that a fitted tree model rates as important.

    The model is fitted on every feature of the training frame, its
    ``feature_importances_`` are scaled to percentages, and the features whose
    importance is strictly greater than ``threshold`` percent are kept.

    Parameters
    ----------
    model : estimator
        Estimator exposing ``fit`` and, once fitted, ``feature_importances_``.
        It is fitted in place.
    threshold : float, default 1.0
        Importance, in percent, that a feature must exceed to be selected.
    train_df, test_df : pandas.DataFrame, optional
        Frames holding the feature columns plus a ``prognosis`` target column.
        When omitted, the notebook-level ``train`` and ``test`` frames are used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` restricted to the selected features, with the
        columns ordered from most to least important.
    """
    # Fall back to the notebook globals so one-argument calls keep working
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # Separate the features from the target
    X_train = train_df.drop(columns='prognosis')
    y_train = train_df['prognosis']
    X_test = test_df.drop(columns='prognosis')

    # Fit on every feature so the model can rank them
    model.fit(X_train, y_train)

    # Importances as percentages, highest first
    importances = pd.Series(model.feature_importances_ * 100, index=X_train.columns)
    importances = importances.sort_values(ascending=False)
    selected = importances[importances > threshold].index

    # Apply the same column subset to both splits
    return X_train[selected], X_test[selected]

### Using Random Forest

In [13]:
# Select features with a default random forest.
# The forest is unseeded, so the number of selected features may vary between runs.
X_train, X_test = feature_selection(RandomForestClassifier())
print("Number of selected Feature: ", X_train.shape[1])

Number of selected Feature:  30


### Using Extra Tree

In [14]:
# Select features with a default extra-trees ensemble.
# The ensemble is unseeded, so the number of selected features may vary between runs.
X_train, X_test = feature_selection(ExtraTreesClassifier())
print("Number of selected Feature: ", X_train.shape[1])

Number of selected Feature:  36


In [15]:
# Final feature set used by every model below. The estimator is seeded so the
# same columns are selected on every run; unseeded calls can return a
# different number of features from one run to the next.
X_train, X_test = feature_selection(ExtraTreesClassifier(random_state=42))
y_train, y_test = train['prognosis'], test['prognosis']

X_train.shape, X_test.shape

((4920, 37), (42, 37))

# Model Selection

### Accuracies Function

In [16]:
def accuracies(model):
    """Report accuracy and micro-averaged F1 for a fitted model.

    The model is scored on the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` splits.

    Returns a DataFrame with one row per metric ("Accuracy", "F1 Score") and
    one column per split ("Training Set", "Testing Set").
    """
    scores = {"Accuracy": [], "F1 Score": []}

    # Score the training split first, then the testing split
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        predicted = model.predict(features)
        scores["Accuracy"].append(accuracy_score(labels, predicted))
        scores["F1 Score"].append(f1_score(labels, predicted, average='micro'))

    # Metrics become rows, splits become named columns
    return pd.DataFrame.from_dict(scores, orient='index', columns=["Training Set", "Testing Set"])

### Random Forest

In [17]:
# Random forest with 200 trees, fitted on the selected features
model = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

accuracies(model)

Unnamed: 0,Training Set,Testing Set
Accuracy,0.885366,0.904762
F1 Score,0.885366,0.904762


## XGBoost

In [18]:
# n_estimators is passed by keyword: the first positional parameter of
# XGBClassifier is not n_estimators, so a bare 250 does not set the number of
# trees (and recent XGBoost releases reject positional arguments altogether).
model = XGBClassifier(n_estimators=250, verbosity=0)
model.fit(X_train, y_train)
accuracies(model)

Unnamed: 0,Training Set,Testing Set
Accuracy,0.885366,0.904762
F1 Score,0.885366,0.904762


## SVM

In [19]:
# Support vector classifier with default settings, fitted on the selected features
model = SVC()
model.fit(X_train, y_train)
accuracies(model)

Unnamed: 0,Training Set,Testing Set
Accuracy,0.885366,0.904762
F1 Score,0.885366,0.904762


All three models achieve identical scores on the selected features; I will choose the `XGBoost Classifier` for making predictions.

# Evaluation

### Bigger Accuracies Function

In [20]:
def accuracies(model):
    """Report accuracy, precision, recall and F1 for a fitted model.

    The model is scored on the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` splits. Precision, recall and F1 are micro-averaged.
    This definition replaces the earlier two-metric helper of the same name.

    Returns a DataFrame with one row per split ("Training Set",
    "Testing Set") and one column per metric ("Accuracy", "Precision",
    "Recall", "F1 Score").
    """
    def _score(features, labels):
        # All four metrics for one split, in the column order of the result
        predicted = model.predict(features)
        return [
            accuracy_score(labels, predicted),
            precision_score(labels, predicted, average='micro'),
            # Recall must come from recall_score, not precision_score
            recall_score(labels, predicted, average='micro'),
            f1_score(labels, predicted, average='micro'),
        ]

    return pd.DataFrame(
        [_score(X_train, y_train), _score(X_test, y_test)],
        index=["Training Set", "Testing Set"],
        columns=["Accuracy", "Precision", "Recall", "F1 Score"],
    )

In [21]:
# n_estimators is passed by keyword: the first positional parameter of
# XGBClassifier is not n_estimators, so a bare 250 does not set the number of
# trees (and recent XGBoost releases reject positional arguments altogether).
model = XGBClassifier(n_estimators=250, verbosity=0)
model.fit(X_train, y_train)
accuracies(model)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Training Set,0.885366,0.885366,0.885366,0.885366
Testing Set,0.904762,0.904762,0.904762,0.904762


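The table above reports, for the training and testing sets:
* Accuracy
* Precision
* Recall
* F1-Score

Because these scores are micro-averaged over a single-label multi-class target, precision, recall and F1 all coincide with accuracy.

**The per-class report below gives more information.**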

### Classification Report

In [23]:
# Per-class precision, recall, F1 and support on the testing set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       0.50      1.00      0.67         1
                                   Acne       0.33      1.00      0.50         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       0.00      0.00      0.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

## Save the model as a pickle file (`model.sav`)

In [42]:
# Build the full dataset (train + test) used for the final fit

# All feature rows from both splits, re-indexed from 0
X = pd.concat([X_train, X_test], join='inner', ignore_index=True)

# Matching targets as a single 1-D Series, the shape estimators expect for `y`
y = pd.concat([y_train, y_test], ignore_index=True)

In [43]:
# Refit the chosen model on the combined train + test data before exporting it
model.fit(X, y)

# Persist the fitted model; the context manager closes the file handle even
# if pickling fails
with open('model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

I will deploy the model online using the `Streamlit` library. <br>
The deployment phase continues in the **main.py** file.