In [1]:
# Mount Google Drive inside the Colab VM so the project folder is reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Show the starting directory, switch to the project folder on Drive, and list
# its contents. Later relative paths and the aka_data_analysis import rely on
# this working directory.
!pwd
%cd /content/drive/MyDrive/ML2023/data-analysis/
!ls

/content
/content/drive/MyDrive/ML2023/data-analysis
 airline_reviews
 aka_data_analysis
 app2.py
 Bank_Loan_Approval
 Breast_Cancer
 Car
 Children_Anemia
 comp.py
 covid
 creditcard_2023.csv
'Data Description_Metro.pdf'
'dataEnter the name of the output data file for the report_Output.txt'
 data_Output.txt
 data_survey.csv
 Diabete
 doctor
 Drug_Classification
 EV_cars
 Game_of_Thrones_Battles_Deaths
 Heart_Attack
 house
 img
 ItemSales
 main_house.py
 main.py
'MetroPT3(AirCompressor).csv'
 my_dash_class
 NY
 Obesity_CVD
 Obesity_Output.txt
 sample_submission.csv
 spotify-2023.csv
 test.csv
 Traffic_Prediction
 train.csv


# **CLASSIFIER**

*This algorithm will identify the optimal classification machine learning model for a given dataset.*

# Import the helper classes

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import math
from plotly.subplots import make_subplots

from aka_data_analysis.aka_plot import aka_plot, aka_correlation_analysis
from aka_data_analysis.aka_learning import aka_learn,aka_clean,aka_filter

# Instantiate the project helper classes once each.
# NOTE(review): every assignment below rebinds the imported class name to an
# instance (e.g. `aka_plot` stops being the class), so these lines only work
# right after the imports above have run in the same cell. The names are kept
# because later cells call methods on them.
aka_plot = aka_plot()
aka_corr_an = aka_correlation_analysis()
aka_clean = aka_clean()
aka_learn = aka_learn()
aka_filter = aka_filter()

In [4]:
import warnings
from sklearn.exceptions import FitFailedWarning
# Silence FitFailedWarning and UserWarning for the rest of the session.
# The filters are registered in this order, one "ignore" rule per category.
for _category in (FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)

In [5]:

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np
import pandas as pd


# Import the dataset


The dataset is available at: https://www.kaggle.com/datasets/hasibullahaman/traffic-prediction-dataset

In [6]:
# Load the traffic dataset through the project helper. The path is relative to
# the working directory set by the earlier %cd.
df = aka_clean.df_get('Traffic_Prediction/Traffic.csv')

# Clean Data

In [7]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


## Swap the target and the last feature

In [8]:
# Ask the helper to swap a feature with the last column.
# NOTE(review): for this dataset the recorded output shows the call printing
# "Invalid feature indices or feat_a is equal to feat_b." and the column order
# is unchanged ('Traffic Situation' is already last). Confirm the meaning of
# the -1 argument against aka_clean.swap_features.
df = aka_clean.swap_features(df,-1)
df.head()

Invalid feature indices or feat_a is equal to feat_b.


Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


## Drop feature(s)


In [9]:
# Features to drop before training. The list is empty here, and the preview
# below shows the same columns as before the call.
feat = []
df = aka_clean.drop_feature(df,feat)
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


In [10]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Date,CarCount,BikeCount,BusCount,TruckCount,Total
count,2976.0,2976.0,2976.0,2976.0,2976.0,2976.0
mean,16.0,68.696573,14.917339,15.27957,15.324933,114.218414
std,8.945775,45.850693,12.847518,14.341986,10.603833,60.190627
min,1.0,6.0,0.0,0.0,0.0,21.0
25%,8.0,19.0,5.0,1.0,6.0,55.0
50%,16.0,64.0,12.0,12.0,14.0,109.0
75%,24.0,107.0,22.0,25.0,23.0,164.0
max,31.0,180.0,70.0,50.0,40.0,279.0


##  Convert categorical variables into numerical representations

In [11]:
# Build the encoding for the categorical columns and apply it; the preview
# below shows Time, Day of the week and Traffic Situation as integer codes.
# `swapMapping` is kept for later, where swap_map() is applied to y_pred/y_test
# (presumably the inverse code -> label mapping; confirm in aka_clean).
mapping,swapMapping = aka_clean.CleaningVar(df)
df = aka_clean.CleaningDF(df,mapping)
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,0,10,0,31,0,4,4,39,0
1,1,10,0,49,0,3,3,55,0
2,2,10,0,46,0,3,6,55,0
3,3,10,0,51,0,2,5,58,0
4,4,10,0,57,6,15,16,94,1


# Search for the most effective ML algorithm to learn the dataset

## Choose the parameters for the search

<center>


<font size="45">

**Machine learning algorithms available for the search**

| Key | ML name |
|-----|---------|
| LGC | Logistic Regression |
| DTC | Decision Tree Classifier |
| KNN | K-Nearest Neighbors |
| SVC | Support Vector Classification |
| GNB | Gaussian Naive Bayes |
| SGD | Stochastic Gradient Descent |
| ABC | AdaBoost Classifier |
| RFC | Random Forest Classifier |
| GBC | Gradient Boosting Classifier |

</font>

</center>

In [12]:

# Keys of the machine learning algorithms to include in the search
# (see the key / ML name table above).
mls = ['DTC', 'GNB', 'KNN', 'GBC', 'ABC', 'RFC', 'LGC']

# Limits m of the confidence interval [-m, m]; values outside it are
# treated as outliers and eliminated.
confidence_interval_limit = [3]

# Correlation limit between features above which a feature is removed.
correlation_percentage_threshold = [0.9]

# Standardization mode:
#   'XY'   -> standardize both X and Y
#   'X'    -> standardize only X
#   'Y'    -> standardize only Y
#   'none' -> do not standardize the dataset
pre_proc = 'none'

# Report display mode:
#   'all' -> display every ML report
#   'sup' -> display only the most significant report
disp_dash = 'all'

# Model parameter mode:
#   'adv'  -> use advanced parameters in the ML model
#   'none' -> use default parameters
mach = 'adv'

# Name of the output data file for the report.
file_name = 'data'

# Minimum ML score for a result to be saved in the report.
file_name_scre = 0.85

In [13]:
# Run the model search with the settings chosen above. Per the recorded output
# it prints one row per algorithm key in `mls` with the score, MSE and elapsed
# minutes. This is the slowest cell: GBC alone took about 2.7 minutes.
aka_learn.Search_ML(df,mls,mach,pre_proc,confidence_interval_limit,correlation_percentage_threshold,disp_dash,file_name,file_name_scre)

conf_inter  corr_per  size_removed  ML   score      MSE    simul_time(min)
___________________________________________________________________________
  [-3, 3]      0.9     (59, 1)     DTC     100.000     0.000      0.03 
  [-3, 3]      0.9     (59, 1)     GNB     84.132     0.419      0.00 
  [-3, 3]      0.9     (59, 1)     KNN     92.694     0.138      0.12 
  [-3, 3]      0.9     (59, 1)     GBC     99.772     0.009      2.74 
  [-3, 3]      0.9     (59, 1)     ABC     78.881     0.554      0.74 
  [-3, 3]      0.9     (59, 1)     RFC     99.543     0.011      0.22 
  [-3, 3]      0.9     (59, 1)     LGC     88.356     0.233      0.14 


In [14]:
# Re-train the single model chosen from the search table: DTC had the top
# score in the recorded run. The settings mirror the search row
# (interval [-3, 3], correlation 0.9, no standardization).
# NOTE(review): these values are hand-copied from the configuration cell and
# the printed table; update them together if the search settings change.
std_inter = [-3,3]
corr_per = 0.9
ml = 'DTC'
pre_proc = 'none'


# Returns the fitted classifier, its score and MSE, the correlated-feature
# info used for plotting below, the filtered frame `df_`, and the test /
# predicted targets (meanings inferred from how the names are used in later
# cells; confirm against aka_learn.ML).
clf,scre,MSE_,corr_tmp,df_,y_test,y_pred = aka_learn.ML(df,std_inter,corr_per,pre_proc,ml)

## Graph the features that are highly correlated


In [15]:
# Plot the features reported as highly correlated by aka_learn.ML.
# The helper may return None (guarded below), presumably when there is nothing
# to plot. The arguments 400, 500, 1 look like figure dimensions and a display
# flag; TODO confirm against Plot_Correlate_Features.
fig = aka_corr_an.Plot_Correlate_Features(df,list(corr_tmp),400,500,1)
if fig is not None:
    fig.show()

## Visualize the distribution of the filtered dataset

In [16]:
# Compare the original and filtered values of the first three columns.
# The loop variable is deliberately not called `feat`, so the `feat` list
# defined in the "Drop feature(s)" cell is not overwritten with an integer.
for feat_idx in [0, 1, 2]:
    df_0 = pd.DataFrame()
    df_0['true'] = df[df.columns[feat_idx]]
    # Column assignment aligns on the index of `df_0` (i.e. the index of `df`).
    # Assumes `df_` keeps the original row labels, so rows removed by the
    # filtering show up as NaN in 'filtered' -- TODO confirm.
    df_0['filtered'] = df_[df_.columns[feat_idx]]
    fig = aka_plot.plot_history_all(df_0)
    fig.show()

## Confusion Matrix

In [17]:
# Pass the predicted and true targets through swap_map with swapMapping
# before building the report labels.
y_pred_ = aka_clean.swap_map(y_pred, swapMapping)
y_test_ = aka_clean.swap_map(y_test, swapMapping)

# Sorted unique class values seen in either series, as strings.
Label = list(
    map(str, np.unique(pd.concat([y_pred_, y_test_])))
)

In [18]:
# Draw the confusion matrix of true vs. predicted labels.
# `shw` is passed straight to the helper (presumably a display flag; confirm
# in aka_plot.plot_confusion_matrix).
shw = 1
fig2 =  aka_plot.plot_confusion_matrix(y_test_,y_pred_,Label,shw)
fig2.show()

## Classification Report

In [19]:
# Render the classification report for the same true / predicted labels.
# `shw` is passed straight to the helper (presumably a display flag; confirm
# in aka_plot.plot_classification_report).
shw = 1
fig3 =  aka_plot.plot_classification_report(y_test_,y_pred_,Label,shw)
fig3.show()