In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
dry_bean = fetch_ucirepo(id=602) 
  
# data (as pandas dataframes) 
X = dry_bean.data.features 
y = dry_bean.data.targets 
  
# metadata 
print(dry_bean.metadata) 
  
# variable information 
print(dry_bean.variables) 

In [None]:
import gc # Garbage Collector

import pandas as pd
import numpy as np
import os

# Time Modules
import calendar
from time import time
import datetime
from datetime import datetime, timedelta

# Disable truncation when DataFrames are displayed: every row and column is shown.
# NOTE(review): with max_rows=None, displaying a whole large frame prints all of
# it — prefer .head()/.sample() in the cells below.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Plots

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(18, 12)})
%matplotlib inline

# Statistics 
from scipy.stats import norm
from scipy.stats import zscore
from scipy import stats

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. **What is the Dry Bean Dataset?**

In [None]:
# Column names, dtypes and non-null counts of the feature frame.
X.info()

In [None]:
# First rows of the feature frame, as a quick look at the raw values.
X.head()

In [None]:
# Summary statistics, transposed so that each feature is one row.
X.describe().T

2. **How many instances (rows) and attributes (columns) are present in the dataset?**  

In [None]:
# Unpack the (rows, columns) shape once, then report each dimension.
n_rows, n_cols = X.shape
print('Rows:', n_rows)
print('Columns:', n_cols)

3. **What are the different classes of beans in the dataset?**     

In [None]:
"""This study utilized seven distinct varieties of dry beans, considering characteristics such as form, shape, type, and structure based on market conditions. 
A computer vision system was designed to differentiate these seven registered bean varieties, which share similar traits, to achieve standardized seed classification."""

4. **What are the main features (attributes) used to describe each bean?**  


In [None]:
# Answer from the data itself instead of a hand-typed list, so the answer can
# never drift from the actual feature columns. 'Class' is dropped defensively in
# case the target column has already been attached to X by a later cell.
X.columns.drop('Class', errors='ignore').tolist()

5. **Are all attributes numerical, or are there categorical attributes as well?**  

In [None]:
# Select genuinely numeric dtypes. `exclude='object'` would also count
# bool / datetime / category columns as "numerical".
numerical_columns = X.select_dtypes(include='number').columns
# True only when every column of X is numeric.
print("Are all numerical features?", len(numerical_columns) == X.shape[1])

6. **What type of classification problem is this dataset used for? (Binary or Multi-class?)**

In [None]:
# Jupyter renders only a cell's LAST expression. The counts are followed by the
# answer string, so they must be printed explicitly or they are never shown.
print(y['Class'].value_counts(ascending=False))

"""Is a multi-class classification problem because we have to classify 7 types of beans."""

7. **Which machine learning algorithms can be used to classify the bean types?**  


In [None]:
"""
1. Decision Trees.
2. SVM.
3. Random Forest.
"""

8. **Use Histogram plots to understand the numerical features.**  

In [None]:
# Attach the target as a 'Class' column on a NEW frame rather than writing into
# the frame object obtained from `dry_bean.data` (in-place assignment mutates it).
# np.asarray(...).ravel() accepts `y` both as a one-column DataFrame and as a
# 1-D array, so the cell also works when re-run after `y` has been converted.
X = X.assign(Class=np.asarray(y).ravel())

In [None]:
# One "hls" colour per numerical feature (the box-plot cell reuses this list).
color_features = list(sns.color_palette("hls", len(numerical_columns)))

# 4x4 grid of panels, flattened so it can be walked in step with the features.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(16, 12), dpi=120)
axes = axes.ravel()

# Histogram + KDE of each feature, split by bean class.
# NOTE(review): with hue='Class' seaborn colours by class, so `color` probably
# has no visible effect here — confirm.
panels = zip(numerical_columns, color_features, axes)
for plot_no, (feature, colour, panel) in enumerate(panels, start=1):
    sns.histplot(
        data=X,
        x=feature,
        hue='Class',
        color=colour,
        fill=True,
        kde=True,
        element='bars',
        ax=panel,
    )
    panel.set_title(f"Plot {plot_no}: {feature}", fontweight='bold')

plt.tight_layout()
plt.subplots_adjust(top=0.95)
plt.show()

9. **Use box plots to understand the numerical features.**  

In [None]:
# 4x4 grid of panels, flattened so it can be walked in step with the features.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(16, 12), dpi=120)
axes = axes.ravel()

# Box plot of each feature, split by bean class. `color_features` comes from the
# histogram cell above.
# NOTE(review): some seaborn releases reject `hue` when only `x` is given —
# confirm against the installed version.
panels = zip(numerical_columns, color_features, axes)
for plot_no, (feature, colour, panel) in enumerate(panels, start=1):
    sns.boxplot(data=X, x=feature, hue='Class', color=colour, ax=panel)
    panel.set_title(f"Plot {plot_no}: {feature}", fontweight='bold')

plt.tight_layout()
plt.subplots_adjust(top=0.95)
plt.show()

10. **Use a correlation plot to understand the relationships between variables.**  

In [None]:
# Pairwise correlation matrix of the numerical features.
corr = X[numerical_columns].corr()

# Boolean mask that hides the upper triangle (diagonal included). Building it
# from ones_like makes it independent of the correlation VALUES: np.triu(corr)
# leaves a cell unmasked whenever its coefficient happens to be exactly 0.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(16, 12), dpi=120)
ax.set_title('Heatmap of Features Correlation', fontweight='bold')

# `linewidths` is the documented heatmap parameter (the original passed `linewidth`).
sns.heatmap(corr, mask=mask, linewidths=0.5, fmt='.2f', annot=True, ax=ax);

11. **What performance metrics can be used to evaluate classification models trained on this dataset?**

In [None]:
# Import first so that the answer string is the cell's LAST expression: Jupyter
# renders only the final expression, so with the import last the answer was
# never displayed.
from sklearn.metrics import confusion_matrix,classification_report, f1_score

"""
1.F1-score.
2.Confusion Matrix. 
3.Classification Report
"""

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

12. **Use a Pipeline to preprocess and model your data.**

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Separate the label vector from the feature frame.
# Guarded so the cell is idempotent: once 'Class' has been split off, re-running
# leaves X and y untouched instead of raising KeyError: 'Class'.
if 'Class' in X.columns:
    y = X['Class'].to_numpy()
    X = X.drop(columns='Class')

In [None]:
# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions the same in both splits, consistent with the stratified
# cross-validation used in the rest of the notebook.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
# Shared 10-fold stratified splitter, passed as `cv=` to cross_val_score later on.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
"""
Standard Scaler seems to be a great tool to use in this case to standardize features by removing the mean and scaling to unit variance. 
It is commonly used to ensure that all features contribute equally to a model, 
preventing features with large magnitudes from dominating the learning process.

Principal Component Analysis (PCA) is a powerful dimensionality reduction technique used for various purposes.
For example, it helps simplify datasets with many features while retaining most of the important information, making analysis easier and more efficient.
"""

In [None]:
# Standardise the numerical features.
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
    ])

# scale -> PCA -> random forest, fitted as a single estimator so the scaler and
# the PCA are learned from the training split only.
# NOTE(review): PCA() without n_components keeps all components, i.e. it rotates
# the features but does not reduce dimensionality — confirm this is intended.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ("pca", PCA()),
                              # Same seed as the split and the CV splitter, so
                              # re-running the cell reproduces the same forest.
                              ('model', RandomForestClassifier(random_state=42))
                             ])

my_pipeline.fit(X_train, y_train)

# Predictions on the validation split, reused by the evaluation cells below.
prediction_pipeline = my_pipeline.predict(X_valid)

In [None]:
# Text report of the pipeline's validation predictions against the true labels.
print(classification_report(y_valid, prediction_pipeline))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix of the fitted pipeline on the validation split, labelled with
# the pipeline's classes. Assigning to `_` and the trailing `;` keep the returned
# display object's repr out of the cell output.
_ = ConfusionMatrixDisplay.from_estimator(my_pipeline, X_valid, y_valid, display_labels=my_pipeline.classes_);

In [None]:
# Weighted-average F1 of the pipeline's predictions on the validation split.
print("F1-score:", f1_score(y_valid, prediction_pipeline, average='weighted'))

13. **Compare different models to see which one is more accurate.**
    > StratifiedKFold strategy is applied.

In [None]:
def stratified_kfold_accuracy(model, X, y, n_splits):
    """Score ``model`` with shuffled, stratified K-fold cross-validation.

    Despite the function name, the per-fold score is the *weighted F1-score*
    (``f1_score(..., average='weighted')``), not accuracy.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    X : feature table, a pandas DataFrame or an array-like, one row per sample.
    y : labels aligned with the rows of ``X`` (pandas Series or array-like).
    n_splits : int, number of folds.

    Returns
    -------
    tuple of two str
        The mean and the standard deviation of the fold scores, each rounded to
        3 decimals, formatted as ``'Mean:...'`` and ``'Standard Deviation: ...'``
        and each terminated by a newline.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _take(data, positions):
        # split() yields POSITIONAL indices, so pandas objects must be indexed
        # with .iloc. .loc would treat the positions as index *labels*, which
        # only coincides for a default RangeIndex and selects the wrong rows
        # (or raises KeyError) on a filtered or shuffled frame.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    fold_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = _take(X, train_index), _take(X, test_index)
        y_train, y_test = _take(y, train_index), _take(y, test_index)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        fold_scores.append(f1_score(y_test, y_pred, average='weighted'))

    mean_score = np.mean(fold_scores)
    return f'Mean:{round(mean_score, 3)}\n', f'Standard Deviation: {round(np.std(fold_scores), 3)}\n'

In [None]:
# Compare the candidate classifiers with 10-fold stratified cross-validation.
# The stochastic models are seeded so the comparison is reproducible.
# NOTE(review): the models are fitted on unscaled features, which may
# disadvantage distance-based models such as KNN — confirm this is intended.
candidate_models = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GaussianNB(),
    KNeighborsClassifier(),
]
for model in candidate_models:
    # The helper returns two preformatted, newline-terminated strings. Print them
    # individually: printing the tuple itself shows its repr, with quotes and a
    # literal "\n" instead of real line breaks.
    mean_line, std_line = stratified_kfold_accuracy(model, X, y, 10)
    print(model)
    print(mean_line, std_line, sep='', end='')
    print('=='*50)

In [None]:
"""RandomForest seems to be the best model to use to achieve a higher accuracy."""

14. **Tune hyperparameters using Optuna to improve accuracy with RandomForestClassifier.**  

In [None]:
# Wall-clock start; the elapsed time is printed at the end of this cell.
start = time()
import optuna
from sklearn.ensemble import RandomForestClassifier

# Only show Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Report the shape of each split.
for label, split in (
    ('X_train:', X_train),
    ('X_val:', X_valid),
    ('y_train:', y_train),
    ('y_val:', y_valid),
):
    print(label, split.shape)

def objective(trial):
    """Optuna objective: mean cross-validated score of a RandomForestClassifier.

    Samples one hyper-parameter configuration from ``trial``, evaluates it on
    ``X_train`` / ``y_train`` with ``cross_val_score`` over the shared ``skf``
    splitter (no ``scoring`` is passed, so the estimator's default score is
    used) and returns the mean of the fold scores. Higher is better, so the
    study must be created with ``direction='maximize'``.
    """
    params = {
        "n_estimators": trial.suggest_int(name="n_estimators", low=100, high=1200, step=50),
        # 'auto' is no longer accepted by recent scikit-learn releases, and for
        # classifiers it used to be an alias of 'sqrt', so the old choice list
        # held a single distinct value. 'log2' gives the search a real alternative.
        "max_features": trial.suggest_categorical(name="max_features", choices=['sqrt', 'log2']),
        "max_depth": trial.suggest_int(name="max_depth", low=1, high=14, step=1),
        "min_samples_split": trial.suggest_int(name="min_samples_split", low=2, high=14, step=1),
        "min_samples_leaf": trial.suggest_int(name="min_samples_leaf", low=2, high=8, step=1),
    }
    model = RandomForestClassifier(random_state=42, **params)

    cv_score = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=skf)
    mean_cv_accuracy = cv_score.mean()
    return mean_cv_accuracy

# The objective returns a score where higher is better, so the study must
# MAXIMIZE it. create_study() minimizes by default, which made `best_params`
# the WORST configuration tried. The seeded sampler makes the search reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

print('Params:', study.best_params)

# Train a new model using the best parameters
best_model = RandomForestClassifier(random_state=42, **study.best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_valid)

# Weighted F1 on the validation split. The variable name is kept, but the
# printed label now names the metric actually computed (it used to say "Accuracy").
test_acc = f1_score(y_valid, y_pred, average='weighted')

print("Weighted F1-score:", test_acc)
print('Time:', time() - start)