# **CLASSIFIER**

*This notebook searches for the best-performing classification machine learning model for a given dataset.*

# Import the helper classes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import math
from plotly.subplots import make_subplots

from aka_data_analysis.aka_plot import aka_plot, aka_correlation_analysis
from aka_data_analysis.aka_learning import aka_learn,aka_clean,aka_filter

# Create one instance of each helper used throughout the notebook.
# NOTE: aka_plot, aka_clean, aka_learn and aka_filter are rebound from the
# imported name to the object returned by calling it, so after this point
# those names refer to the instances, not to the imported callables.
# aka_correlation_analysis was previously instantiated twice into
# aka_corr_an; the second call only replaced the first, so it is created once.
aka_plot = aka_plot()
aka_corr_an = aka_correlation_analysis()
aka_clean = aka_clean()
aka_learn = aka_learn()
aka_filter = aka_filter()

In [None]:
import warnings
from sklearn.exceptions import FitFailedWarning
# Suppress FitFailedWarning and UserWarning for the rest of the session.
for ignored_category in (FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [None]:

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np
import pandas as pd


# Import the dataset


The data is available at: https://www.kaggle.com/datasets/aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster

In [None]:
# Load the obesity dataset through the aka_clean helper.
# The path is relative; presumably it is resolved against the current working
# directory -- verify against aka_clean.df_get.
df = aka_clean.df_get('Obesity_CVD/ObesityDataSet.csv')

# Clean Data

In [None]:
# Preview the first five rows of the freshly loaded dataset.
df.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


## Swap the target and the last feature

In [None]:
# Ask the helper to swap the feature at index -1 with the target column.
# NOTE(review): in the recorded run this call printed
# "Invalid feature indices or feat_a is equal to feat_b." and the preview
# below shows the column order unchanged, so no swap took place. The target
# (NObeyesdad) already appears as the last column -- confirm whether this
# step is needed for this dataset.
df = aka_clean.swap_features(df,-1)
df.head()

Invalid feature indices or feat_a is equal to feat_b.


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


## Drop feature(s)


In [None]:
# Features to drop from the dataset; the list is empty here, and the preview
# below shows the same columns as before the call.
feat = []
df = aka_clean.drop_feature(df,feat)
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); at this stage
# only the numeric columns appear in the result.
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


## Convert categorical variables into numerical representations

In [None]:
# Build the category encoding for the dataset, then apply it to the dataframe.
# `mapping` is the encoding passed to CleaningDF; `swapMapping` is kept for the
# confusion-matrix step, where it is passed to aka_clean.swap_map
# (presumably the inverse mapping, code -> original label -- TODO confirm).
mapping,swapMapping = aka_clean.CleaningVar(df)
df = aka_clean.CleaningDF(df,mapping)
# Preview the encoded dataset: categorical columns now hold integer codes.
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,0,0,2.0,3.0,0,0,2.0,0,0.0,1.0,0,0,0
1,0,21.0,1.52,56.0,0,0,3.0,3.0,0,1,3.0,1,3.0,0.0,1,0,0
2,1,23.0,1.8,77.0,0,0,2.0,3.0,0,0,2.0,0,2.0,1.0,2,0,0
3,1,27.0,1.8,87.0,1,0,3.0,3.0,0,0,2.0,0,2.0,0.0,2,1,1
4,1,22.0,1.78,89.8,1,0,2.0,1.0,0,0,2.0,0,0.0,0.0,1,0,2


# Search for the most effective ML algorithm to learn the dataset

## Choose the parameters for the search

<center>


<font size="45">

**Machine learning algorithms available for the search**

| Key | ML name |
|-----|---------|
| LGC | Logistic Regression |
| DTC | Decision Tree Classifier |
| KNN | K-Nearest Neighbors |
| SVC | Support Vector Classification |
| GNB | Gaussian Naive Bayes |
| SGD | Stochastic Gradient Descent |
| ABC | AdaBoost Classifier |
| RFC | Random Forest Classifier |
| GBC | Gradient Boosting Classifier |

</font>

</center>

In [None]:

# Keys of the machine learning algorithms to include in the search.
mls = ['DTC','GNB','KNN', 'GBC', 'ABC','RFC','LGC']

# Limit m of the confidence interval [-m, m]; values outside the interval are
# treated as outliers and eliminated.
# Given as a list -- presumably every listed value is tried; TODO confirm.
confidence_interval_limit =   [3]

# Correlation limit between features above which a feature is removed.
# Given as a list -- presumably every listed value is tried; TODO confirm.
correlation_percentage_threshold = [.9]

# Standardization mode:
#   'XY'   standardize both X and Y
#   'X'    standardize only X
#   'Y'    standardize only Y
#   'none' do not standardize the dataset
pre_proc = 'none'

# Report verbosity:
#   'all' display all reports of the ML
#   'sup' display the most significant report
disp_dash = 'all'

# Model parameters:
#   'adv'  use advanced parameters in the ML model
#   'none' use default parameters
mach = 'adv'

# Name of the output data file for the report.
file_name = 'data'

# Minimum ML score for a result to be saved in the report.
# NOTE(review): the scores printed by Search_ML are on a 0-100 scale
# (e.g. 94.231) while this threshold is 0.85 -- confirm which scale the
# helper expects.
file_name_scre = 0.85

In [None]:
# Run the model search with the parameters chosen above; the recorded output
# is a table with one row per algorithm key (score, MSE, run time in minutes).
aka_learn.Search_ML(df,mls,mach,pre_proc,confidence_interval_limit,correlation_percentage_threshold,disp_dash,file_name,file_name_scre)

conf_inter  corr_per  size_removed  ML   score      MSE    simul_time(min)
___________________________________________________________________________
  [-3, 3]      0.9     (206, 0)     DTC     94.231     0.323      0.05 
  [-3, 3]      0.9     (206, 0)     GNB     68.706     1.563      0.00 
  [-3, 3]      0.9     (206, 0)     KNN     94.580     0.283      0.06 
  [-3, 3]      0.9     (206, 0)     GBC     97.203     0.117      6.24 
  [-3, 3]      0.9     (206, 0)     ABC     35.140     4.649      0.82 
  [-3, 3]      0.9     (206, 0)     RFC     95.979     0.243      0.27 
  [-3, 3]      0.9     (206, 0)     LGC     83.042     0.673      0.18 


In [None]:
# Configuration of the single model to train: 'GBC' with the [-3, 3] interval
# and 0.9 correlation limit, i.e. the row with the highest score in the
# search results table.
std_inter = [-3,3]
corr_per = 0.9
ml = 'GBC'
pre_proc = 'none'

# Train the selected model. The returned values are reused by later cells:
#   corr_tmp       -> passed to Plot_Correlate_Features
#   df_            -> plotted as the 'filtered' data next to the original df
#   y_test, y_pred -> used for the confusion matrix and classification report
# clf, scre and MSE_ are not used again in the visible part of the notebook
# (presumably the fitted classifier, its score and its MSE -- TODO confirm).
clf,scre,MSE_,corr_tmp,df_,y_test,y_pred = aka_learn.ML(df,std_inter,corr_per,pre_proc,ml)



## Graph the features that are highly correlated


In [None]:
# Plot the features listed in corr_tmp.
# The numeric arguments (400, 500, 1) are passed positionally; presumably
# figure dimensions and a display flag -- TODO confirm against the helper.
fig = aka_corr_an.Plot_Correlate_Features(df,list(corr_tmp),400,500,1)
# Only show the figure when one was returned; in the recorded run the helper
# printed "Empty list is provided." and no figure was displayed.
if fig is not None:
    fig.show()

Empty list is provided.


## Visualize the distribution of the filtered dataset

In [None]:
# For each of the first three columns, plot the original values ('true')
# next to the values of the filtered dataset ('filtered').
# The loop variable is deliberately not called `feat`, so the `feat` list
# defined in the drop-feature cell is not overwritten.
for col_idx in [0, 1, 2]:
    df_0 = pd.DataFrame()
    df_0['true'] = df[df.columns[col_idx]]
    # Columns are matched by position, not by name; this assumes df_ keeps
    # the same column order as df -- TODO confirm when features are removed.
    # Column assignment aligns on the index of 'true', so (assuming df_ keeps
    # the original index labels) rows filtered out of df_ become NaN here.
    df_0['filtered'] = df_[df_.columns[col_idx]]
    fig = aka_plot.plot_history_all(df_0)
    fig.show()
    # The former bare `df_0.shape` expression was removed: inside a loop it is
    # evaluated and discarded, so it never displayed anything.

## Confusion Matrix

In [None]:
# Pass the predictions and the ground truth through swapMapping
# (presumably converting integer codes back to the original class names --
# TODO confirm), then gather every distinct value as a string label.
y_pred_ = aka_clean.swap_map(y_pred, swapMapping)
y_test_ = aka_clean.swap_map(y_test, swapMapping)
all_labels = pd.concat([y_pred_, y_test_])
Label = list(map(str, np.unique(all_labels)))

NameError: ignored

In [None]:
# Draw the confusion matrix of the true labels against the predicted labels.
# shw is forwarded to the plotting helper (presumably a display flag --
# TODO confirm).
shw = 1
fig2 =  aka_plot.plot_confusion_matrix(y_test_,y_pred_,Label,shw)
fig2.show()

## Classification Report

In [None]:
# Draw the classification report for the true labels against the predicted
# labels. shw is forwarded to the plotting helper (presumably a display
# flag -- TODO confirm).
shw = 1
fig3 =  aka_plot.plot_classification_report(y_test_,y_pred_,Label,shw)
fig3.show()