# Dry Beans Classification

## Solutions

* Build models
* Evaluate models
* Tune hyperparameters
* Select the optimal model


## Load Libraries

In [None]:
# Load General Libraries
import pandas as pd
import numpy as np
import os

# Preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Load Model Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Hyperparameter tuning libraries
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Load metrics libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Load Dataset

In [None]:
# Directory that holds the dataset files (here: the notebook's working directory).
# Colab alternative:
#   folder_path = "/content/drive/MyDrive/Colab Notebooks/dscourse/dry_bins_classification"
folder_path = './'

# Other dataset files that can be swapped in for the one loaded below:
#   "Dry_Bean_Dataset.xlsx"  -> df = pd.read_excel(file_path)
#   "db_class_1.csv"
#   "db_class_corr.csv"
file_path = os.path.join(folder_path, "db_class_var.csv")

# Read the selected CSV into the working DataFrame
df = pd.read_csv(file_path)

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,ConvexArea,EquivDiameter,Class
0,28395,610.291,208.178117,173.888747,28715,190.141097,SEKER
1,28734,638.018,200.524796,182.734419,29172,191.27275,SEKER
2,29380,624.11,212.82613,175.931143,29690,193.410904,SEKER
3,30008,645.884,210.557999,182.516516,30724,195.467062,SEKER
4,30140,620.134,201.847882,190.279279,30417,195.896503,SEKER


## Encode the target

In [None]:
# Map the string class labels to integer codes; `le` keeps the fitted
# mapping (le.classes_) so predictions can be decoded later.
le = LabelEncoder()

# Build the encoded frame as a new object so the raw `df` stays untouched
df_en = df.assign(Class=le.fit_transform(df["Class"]))

In [None]:
# Sanity check: "Class" should now hold integer codes instead of names
df_en.head(n=5)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,ConvexArea,EquivDiameter,Class
0,28395,610.291,208.178117,173.888747,28715,190.141097,5
1,28734,638.018,200.524796,182.734419,29172,191.27275,5
2,29380,624.11,212.82613,175.931143,29690,193.410904,5
3,30008,645.884,210.557999,182.516516,30724,195.467062,5
4,30140,620.134,201.847882,190.279279,30417,195.896503,5


In [None]:
# List the distinct encoded class labels, in order of first appearance
df_en["Class"].unique()

array([5, 0, 1, 2, 4, 6, 3])

## Split the Data

In [None]:
# Separate independent and target features.
# Columns are selected by name rather than by position so that:
#   * the target is always "Class", wherever it sits in the frame;
#   * the "Unnamed: 0" column is excluded. It is only the row counter saved
#     with the CSV (0, 1, 2, ... in the preview), and the rows appear to be
#     grouped by class, so keeping it as a feature would leak the target.
# errors="ignore" keeps this cell working when the counter column is absent.
X = df_en.drop(columns=["Class", "Unnamed: 0"], errors="ignore")
y = df_en["Class"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Scaling the data (Standardize) - disabled alternative to the Min-Max scaling used below
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# Normalize the features with Min-Max scaling.
# The scaler learns its ranges from the training split only and the same
# fitted ranges are then applied to both splits.
ms = MinMaxScaler().fit(X_train)
X_train = ms.transform(X_train)
X_test = ms.transform(X_test)

## Model Building and Evaluation

In [None]:
def build_evaluate(model):
  """Fit ``model`` on the training split and print its test-split scores.

  Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
  ``y_test``. ``model`` is fitted in place. Accuracy is printed first,
  followed by macro-averaged precision, recall and F1. Nothing is returned.
  """
  # Train, then predict on the held-out rows
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)

  # Compute every score up front
  accuracy = accuracy_score(y_test, predictions)
  precision = precision_score(y_test, predictions, average='macro')
  recall = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')

  # Report
  print(f"Accuracy: {accuracy: .2f}")
  print(f"Precision: {precision: .2f}")
  print(f"Recall: {recall: .2f}")
  print(f"F1 Score: {f1: .2f}")

In [None]:
# Decision tree baseline.
# random_state is fixed (same seed as the train/test split) so the scores
# are identical on every "Restart & Run All".
dt = DecisionTreeClassifier(random_state=42)
build_evaluate(dt)

Accuracy:  0.88
Precision:  0.90
Recall:  0.89
F1 Score:  0.89


In [None]:
# K-Nearest Neighbors baseline with the library's default settings
knn = KNeighborsClassifier()
build_evaluate(knn)

Accuracy:  0.91
Precision:  0.92
Recall:  0.92
F1 Score:  0.92


In [None]:
# Random forest baseline.
# random_state is fixed (same seed as the train/test split) so the
# bootstrap samples and feature subsets, and therefore the scores, are
# identical on every "Restart & Run All".
rf = RandomForestClassifier(random_state=42)
build_evaluate(rf)

Accuracy:  0.91
Precision:  0.92
Recall:  0.92
F1 Score:  0.92


In [None]:
# XGBoost baseline with the library's default settings
xgb = XGBClassifier()
build_evaluate(xgb)

Accuracy:  0.91
Precision:  0.92
Recall:  0.92
F1 Score:  0.92


## Hyperparameter Tuning

In [None]:
def tune_hyperparameter_rs(model, param, n_iter=10, cv=5, random_state=42):
  """Tune ``model`` with a randomized search fitted on the training split.

  Reads the notebook-level ``X_train`` and ``y_train``.

  Parameters
  ----------
  model : estimator
      Unfitted scikit-learn compatible estimator to tune.
  param : dict
      Hyperparameter name -> candidate values, passed to
      ``RandomizedSearchCV`` as ``param_distributions``.
  n_iter : int, default 10
      Number of parameter settings that are sampled and evaluated.
  cv : int, default 5
      Number of cross-validation folds per sampled setting.
  random_state : int or None, default 42
      Seed for sampling the candidate settings, so that re-running the
      search evaluates the same candidates. Pass ``None`` for unseeded
      sampling.

  Returns
  -------
  dict
      The best parameter setting found (``best_params_``).
  """
  # Setup tuning
  rs = RandomizedSearchCV(estimator=model,
                          param_distributions=param,
                          n_iter=n_iter,
                          cv=cv,
                          random_state=random_state,
                          verbose=1)
  # Run the search
  rs.fit(X_train, y_train)

  # best_score_ is the mean cross-validated score of the best setting
  print(f"Best Accuracy:{rs.best_score_}")
  return rs.best_params_


In [None]:
def tune_hyperparameter_gs(model, param):
  """Tune ``model`` with an exhaustive grid search on the training split.

  Every combination in ``param`` is evaluated with 5-fold cross-validation
  on the notebook-level ``X_train`` and ``y_train``. The best mean
  cross-validated score is printed and the matching parameter setting
  (``best_params_``) is returned.
  """
  # Configure the search and run it in one step (fit returns the search itself)
  gs = GridSearchCV(model, param, cv=5, verbose=1).fit(X_train, y_train)

  print(f"Best Score:{gs.best_score_: .2f}")

  return gs.best_params_

In [None]:
# Candidate hyperparameter values for the random forest
param_dict = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "min_samples_split": [2, 3, 4, 5],
    "criterion": ["gini", "entropy"],
    "max_leaf_nodes": [None, 2, 4, 5]
}

# Seed the forest so that score differences between candidates come from the
# hyperparameters and not from run-to-run randomness in the trees.
rf_ht = RandomForestClassifier(random_state=42)
best_params = tune_hyperparameter_rs(rf_ht, param_dict)
print(best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Beast Accuracy:0.9229429215105919
{'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_depth': 10, 'criterion': 'entropy'}


In [None]:
# Candidate hyperparameter values for the random forest.
# NOTE: the full grid has 3 * 3 * 4 * 4 * 2 * 4 = 1152 combinations, i.e.
# 5760 fits with 5-fold cross-validation, so this cell is slow. Trim the
# lists below to shorten the run.
param_dict = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "min_samples_split": [2, 3, 4, 5],
    "criterion": ["gini", "entropy"],
    "max_leaf_nodes": [None, 2, 4, 5]
}

# Seed the forest so that score differences between candidates come from the
# hyperparameters and not from run-to-run randomness in the trees.
rf_ht = RandomForestClassifier(random_state=42)
best_params = tune_hyperparameter_gs(rf_ht, param_dict)
print(best_params)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
