<a href="https://colab.research.google.com/github/a-donat/Benchmarks_PyCaret/blob/main/Predicting_Rice_Type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Set-Up

## I.A. Import Libraries and Download Data

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d mssmartypants/rice-type-classification
! unzip '/content/rice-type-classification.zip' -d '/content/data'

In [None]:
! pip install pycaret

In [3]:
#import matplotlib.pyplot as plt
#import seaborn as sns
import VisualizeDataAbbrev as viz

import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import train_test_split

## I.B. Load Data and Check Data Integrity

In [4]:
# Read the CSV extracted by the download cell into the working DataFrame.
ds_df = pd.read_csv(
    filepath_or_buffer="/content/data/riceClassification.csv",
)

In [5]:
# Integrity check: list each column's dtype and non-null count.
# The output below shows 18185 non-null entries in all 12 columns (no missing values).
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               18185 non-null  int64  
 1   Area             18185 non-null  int64  
 2   MajorAxisLength  18185 non-null  float64
 3   MinorAxisLength  18185 non-null  float64
 4   Eccentricity     18185 non-null  float64
 5   ConvexArea       18185 non-null  int64  
 6   EquivDiameter    18185 non-null  float64
 7   Extent           18185 non-null  float64
 8   Perimeter        18185 non-null  float64
 9   Roundness        18185 non-null  float64
 10  AspectRation     18185 non-null  float64
 11  Class            18185 non-null  int64  
dtypes: float64(8), int64(4)
memory usage: 1.7 MB


In [6]:
# Count distinct values per column. In the output below `id` has as many
# distinct values as there are rows, and `Class` takes exactly two values.
ds_df.nunique()

id                 18185
Area                5343
MajorAxisLength    18185
MinorAxisLength    18185
Eccentricity       18185
ConvexArea          5450
EquivDiameter       5343
Extent             18007
Perimeter          16246
Roundness          18184
AspectRation       18185
Class                  2
dtype: int64

In [7]:
# Check the balance of the target: the output below shows 9985 rows of class 1
# and 8200 rows of class 0.
ds_df["Class"].value_counts()

1    9985
0    8200
Name: Class, dtype: int64

In [11]:
# Remove the `id` column (a per-row identifier) and keep every other column in
# its original order. errors="ignore" keeps the cell safe to re-run after `id`
# has already been removed, matching the original filter's behaviour.
ds_df = ds_df.drop(columns=["id"], errors="ignore").copy()

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# remaining column, rounded to 5 decimal places for display.
ds_df.describe().round(5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
count,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0
mean,7036.49299,151.68075,59.80785,0.91541,7225.81787,94.13295,0.61665,351.60695,0.708,2.59908,0.54908
std,1467.19715,12.3764,10.06165,0.03058,1502.00657,9.90625,0.10439,29.50062,0.06731,0.43484,0.4976
min,2522.0,74.13311,34.40989,0.67665,2579.0,56.66666,0.38324,197.015,0.17459,1.35813,0.0
25%,5962.0,145.67591,51.39315,0.89162,6125.0,87.12666,0.53853,333.99,0.65096,2.20853,0.0
50%,6660.0,153.88375,55.72429,0.92326,6843.0,92.0857,0.60119,353.088,0.70194,2.60297,1.0
75%,8423.0,160.05621,70.15659,0.94137,8645.0,103.55915,0.69566,373.003,0.76928,2.9641,1.0
max,10210.0,183.21143,82.55076,0.96677,11008.0,114.01656,0.88657,508.511,0.90475,3.91184,1.0



# II. Preprocessing

In [13]:
# Hold out 20% of the rows as a test set. The split is stratified on `Class`
# and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    ds_df,
    test_size=0.20,
    random_state=1,
    stratify=ds_df["Class"],
)

# III. Create Models

In [14]:
# Initialise the PyCaret classification experiment on the training split only;
# `test_df` is not passed in here. session_id fixes the experiment's seed.
exp_clf101 = setup(
    data=train_df,
    target="Class",
    session_id=123,
)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(14548, 11)"
4,Transformed data shape,"(14548, 11)"
5,Transformed train set shape,"(10183, 11)"
6,Transformed test set shape,"(4365, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


In [15]:
# Train and cross-validate the candidate classifiers with fold=5 and keep the
# returned model. The results table below is ordered by Accuracy (descending).
# NOTE(review): Ridge Classifier reports AUC 0.0 in the table — presumably because
# AUC could not be computed for it rather than a real score; confirm.
best_model = compare_models(fold=5)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9901,0.9987,0.9921,0.9898,0.991,0.98,0.98,0.72
lr,Logistic Regression,0.9899,0.9988,0.9928,0.9888,0.9908,0.9796,0.9796,1.238
gbc,Gradient Boosting Classifier,0.9898,0.9982,0.9918,0.9897,0.9907,0.9794,0.9794,3.192
xgboost,Extreme Gradient Boosting,0.9898,0.9988,0.992,0.9895,0.9907,0.9794,0.9794,1.532
rf,Random Forest Classifier,0.9897,0.9985,0.9916,0.9897,0.9906,0.9792,0.9792,1.82
qda,Quadratic Discriminant Analysis,0.9893,0.9983,0.9943,0.9864,0.9903,0.9784,0.9784,0.112
lightgbm,Light Gradient Boosting Machine,0.9893,0.9991,0.992,0.9886,0.9903,0.9784,0.9784,0.474
ada,Ada Boost Classifier,0.9884,0.9988,0.9911,0.9879,0.9895,0.9766,0.9766,0.898
ridge,Ridge Classifier,0.9877,0.0,0.9968,0.9812,0.9889,0.9752,0.9753,0.056
lda,Linear Discriminant Analysis,0.9873,0.999,0.9961,0.9812,0.9886,0.9744,0.9745,0.084


Processing:   0%|          | 0/65 [00:00<?, ?it/s]