# Comparison of Machine Learning Models using PyCaret

We start by installing the PyCaret library:

In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.0-py3-none-any.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.9/485.9 kB 7.8 MB/s eta 0:00:00
Collecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 26.4 MB/s eta 0:00:00
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-1.1.3.tar.gz (160 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 160.5/160.5 kB 13.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collectin

Then we load the Iris dataset into a pandas DataFrame.

In [12]:
import pandas as pd
from sklearn import datasets

# Load the Iris dataset through scikit-learn.
iris = datasets.load_iris()

# One column per feature, plus the integer class label as 'target'.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)

# Lookup from integer class label to species name (not applied to df here).
target_names = dict(enumerate(('setosa', 'versicolor', 'virginica')))

# Optional: add a human-readable label column next to the numeric target.
# df['target_names'] = df['target'].map(target_names)

# Preview the first rows of the frame.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Now, we can compare several ML models using the function `compare_models` from PyCaret.

In [15]:
# import classification module
from pycaret.classification import *

# Initialise the PyCaret experiment on df, using the 'target' column as label.
# session_id fixes the random seed; without it PyCaret picks a new random id
# on every run, so the split and the scores change between runs. 1294 is the
# session id reported in the recorded setup output of this notebook, so
# re-running should give repeatable results.
clf1 = setup(df, target = 'target', session_id = 1294)

# Train and evaluate the candidate models and keep the top-ranked one
# (ranking metric defaults to 'Accuracy').
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,1294
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(150, 5)"
4,Transformed data shape,"(150, 5)"
5,Transformed train set shape,"(105, 5)"
6,Transformed test set shape,"(45, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9809,1.0,0.9809,0.9847,0.9801,0.9707,0.9732,0.038
qda,Quadratic Discriminant Analysis,0.9718,0.9976,0.9718,0.9738,0.9714,0.9571,0.9585,0.035
lr,Logistic Regression,0.97,0.9969,0.97,0.977,0.9696,0.9548,0.9587,0.775
ada,Ada Boost Classifier,0.9618,0.9951,0.9618,0.9724,0.9586,0.9415,0.9483,0.118
gbc,Gradient Boosting Classifier,0.9618,0.9919,0.9618,0.9724,0.9586,0.9415,0.9483,0.274
lightgbm,Light Gradient Boosting Machine,0.9618,0.9951,0.9618,0.9724,0.9586,0.9415,0.9483,0.344
knn,K Neighbors Classifier,0.96,0.9901,0.96,0.969,0.9591,0.9394,0.9447,0.049
xgboost,Extreme Gradient Boosting,0.96,0.9893,0.96,0.967,0.9596,0.9398,0.9436,0.064
rf,Random Forest Classifier,0.9518,0.9969,0.9518,0.9649,0.9486,0.9266,0.9346,0.207
nb,Naive Bayes,0.9509,0.9883,0.9509,0.9572,0.9501,0.9255,0.9292,0.038


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [16]:
# Re-run the comparison, this time ranking the models by Recall
# (when `sort` is omitted the ranking metric is 'Accuracy').
best = compare_models(
    sort='Recall',
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9809,1.0,0.9809,0.9847,0.9801,0.9707,0.9732,0.035
qda,Quadratic Discriminant Analysis,0.9718,0.9976,0.9718,0.9738,0.9714,0.9571,0.9585,0.036
lr,Logistic Regression,0.97,0.9969,0.97,0.977,0.9696,0.9548,0.9587,0.053
ada,Ada Boost Classifier,0.9618,0.9951,0.9618,0.9724,0.9586,0.9415,0.9483,0.143
gbc,Gradient Boosting Classifier,0.9618,0.9919,0.9618,0.9724,0.9586,0.9415,0.9483,0.364
lightgbm,Light Gradient Boosting Machine,0.9618,0.9951,0.9618,0.9724,0.9586,0.9415,0.9483,0.214
knn,K Neighbors Classifier,0.96,0.9901,0.96,0.969,0.9591,0.9394,0.9447,0.05
xgboost,Extreme Gradient Boosting,0.96,0.9893,0.96,0.967,0.9596,0.9398,0.9436,0.062
rf,Random Forest Classifier,0.9518,0.9969,0.9518,0.9649,0.9486,0.9266,0.9346,0.19
nb,Naive Bayes,0.9509,0.9883,0.9509,0.9572,0.9501,0.9255,0.9292,0.035


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [19]:
# Display the model object currently bound to `best`.
best