In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import Data

In [2]:
df = pd.read_csv('./data/Dry_Bean_Dataset.csv')

## Data Description

padndas prfiling

In [None]:
# import matplotlib
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, title="Profiling Report")
# profile.to_file("pandasprofinng_output.html")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'SEKER'')


In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df["Class"].unique()

In [None]:
df.describe()

## Data cleaning

In [None]:
df.isnull().sum()

No missing values

In [None]:
df.info()

In [None]:
class_counts = df["Class"].value_counts()
print(class_counts)

# EDA

In [None]:
indicators = df.columns[:-1]
df.plot(x = "Class", y = indicators, subplots = True, layout = (4, 4),
       figsize = (16, 16), sharex = False, rot = 90)
plt.show()

In [None]:
unique_classes = df.Class.unique()
def draw_conditional_distribution(ax, df, col):
    bins = np.linspace(df[col].min(), df[col].max(), 50)
    for cls in unique_classes:
        ax.hist(df[df.Class == cls][col], alpha=0.5, label=cls, bins=bins)
    ax.set_title(f'Distributions for {col}')
    ax.legend()

In [None]:
fig, ax = plt.subplots(len(df.drop('Class', axis=1).columns) // 4, 4, figsize=(15, 15))
for idx, col in enumerate(df.drop('Class', axis=1).columns):
    draw_conditional_distribution(ax[idx // 4, idx % 4], df, col)
plt.suptitle('Distributions for all features, conditional on target')
plt.tight_layout()
plt.show()

Here you can immediately notice that the Bombay class is very different from the others. These grains are much larger than the others. There are very many features where we can separate objects of this class from objects of another with a probability very close to 1.


In [None]:
# Check outlier of Dry Bean features
plt.figure(figsize = (10, 5))
sns.boxplot(df[indicators])
plt.title("Boxplot of Dry Bean")
plt.xticks(rotation = 90)
plt.show()

In [None]:
removed = df[(df["Area"] >= 100000) & (df["ConvexArea"] >= 100000)]

In [None]:
class_counts = removed["Class"].value_counts()
print(class_counts)

All the instaces from bombay category are outliers also, so we will not consider bobbay class any more

In [None]:
# Delete outlier of "Area" & "ConvexArea"
df = df[(df["Area"] < 100000) | (df["ConvexArea"] < 100000)]

In [None]:
# Check outlier of Dry Bean features
plt.figure(figsize = (10, 5))
sns.boxplot(df[indicators])
plt.title("Boxplot of Dry Bean")
plt.xticks(rotation = 90)
plt.show()

In [None]:
print(df.shape)
print(df["Class"].unique())

In [None]:
def draw_scatterplot(ax, col1, col2, df):
    unique_labels = df.Class.unique()
    for cls in unique_labels:
        filtered = df[df.Class == cls]
        ax.scatter(filtered[col1], filtered[col2], label=cls, alpha=0.2)
    ax.legend()
    ax.set_xlabel(col1)
    ax.set_ylabel(col2)
    ax.set_title(f'Joint scatterplot: {col1} & {col2}')

In [None]:
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
pairs = [('Area', 'Eccentricity'), ('Eccentricity', 'Solidity'), ('Area', 'EquivDiameter'), ('roundness', 'Compactness'),
         ('ShapeFactor1', 'ShapeFactor2'), ('ShapeFactor2', 'ShapeFactor3'), ('ShapeFactor3', 'ShapeFactor4'), ('Compactness', 'Solidity'),
         ('Area', 'Solidity')]
for idx, p in enumerate(pairs):
    draw_scatterplot(ax[idx // 3, idx % 3], *p, df)
plt.tight_layout()
plt.show()

In [None]:
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
plt.figure(figsize = (16, 16))
for i, col in enumerate(indicators, 1):
    plt.subplot(4, 4, i)
    sns.histplot(df[col], kde= True)
    plt.title(f"Distribution of {col} Data")
    plt.tight_layout()
    plt.plot()

Preprocessing

In [None]:

# Correlation of Dry Bran with Class
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Class"] = le.fit_transform(df["Class"])
df_corr = df.corr()
df_corr["Class"].sort_values(ascending = False)



In [None]:
plt.figure(figsize = (10, 10))
sns.heatmap(df_corr, cmap = 'RdPu', annot = True, fmt = ".2f")
plt.show()

In [None]:
# Split Train/Test
X = df.iloc[:,:-1]
y = df.iloc[:, -1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)


Regression Model

In [None]:
# LogisticRegressor - Not Scaled
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", lr.score(X_test, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_lr))

In [None]:
# LogisticRegressor - Scaled
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_sc, y_train)
y_lr = lr.predict(X_test_sc)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", lr.score(X_test_sc, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_lr))

Random Forest Model

In [None]:
# RandomForestRegressor - Not Scaled
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
y_rfr = rfr.predict(X_test)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", rfr.score(X_test, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_rfr))

In [None]:
# RandomForestRegressor - Scaled
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train_sc, y_train)
y_rfr = rfr.predict(X_test_sc)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", rfr.score(X_test_sc, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_rfr))

Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier - not scaled
dt_classifier = DecisionTreeClassifier()

# Train Decision Tree Classifier
dt_classifier.fit(X_train, y_train)

# Predict using Decision Tree Classifier
y_dt = dt_classifier.predict(X_test)

# Model Score
print("Decision Tree Classifier Score:", dt_classifier.score(X_test, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_dt))


In [None]:
# Create Decision Tree Classifier - scaled
dt_classifier = DecisionTreeClassifier()

# Train Decision Tree Classifier
dt_classifier.fit(X_train_sc, y_train)

# Predict using Decision Tree Classifier
y_dt = dt_classifier.predict(X_test_sc)

# Model Score
print("Decision Tree Classifier Score:", dt_classifier.score(X_test_sc, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_dt))

SVM Model

In [None]:
from sklearn.svm import SVC

# Create SVM Classifier
svm_classifier = SVC()

# Train SVM Classifier Not
svm_classifier.fit(X_train, y_train)

# Predict using SVM Classifier
y_svm = svm_classifier.predict(X_test)

# Model Score
print("SVM Classifier Score:", svm_classifier.score(X_test, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_svm))


In [None]:

# Create SVM Classifier
svm_classifier2 = SVC()

# Train SVM Classifier scaled
svm_classifier2.fit(X_train_sc, y_train)

# Predict using SVM Classifier
y_svm = svm_classifier2.predict(X_test_sc)

# Model Score
print("SVM Classifier Score:", svm_classifier2.score(X_test_sc, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_svm))


- Logistic regression with scaling achieved the highest model score of 91.81% and the lowest mean absolute error of 0.203.


Using Kmeans clustering to delete outliers

In [None]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters = 2)
km.fit(df[indicators])
labels = km.labels_
df_km = df[labels == 1]
df_km.reset_index(inplace = True)

In [None]:
plt.figure(figsize = (10, 5))
sns.boxplot(df_km[indicators])
plt.title("Boxplot of Dry Bean w/o outliers")
plt.xticks(rotation = 90)
plt.show()


In [None]:
# Correlation of Dry Bran with Class
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_km["Class"] = le.fit_transform(df_km["Class"])

In [None]:
# Split Train/Test
X = df_km.iloc[:,:-1]
y = df_km.iloc[:, -1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix



# LogisticRegressor - Scaled
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_sc, y_train)
y_lr = lr.predict(X_test_sc)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", lr.score(X_test_sc, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_lr))

# Precision
precision = precision_score(y_test, y_lr, average='macro')
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_lr, average='macro')
print("Recall:", recall)

# F1 Score
f1 = f1_score(y_test, y_lr, average='macro')
print("F1 Score:", f1)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_lr)
print("Confusion Matrix:\n", conf_matrix)



Regression Model

In [None]:
# RandomForestRegressor - Scaled
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train_sc, y_train)
y_rfr = rfr.predict(X_test_sc)

# Model Score
from sklearn.metrics import mean_absolute_error
print("Normal Score :", rfr.score(X_test_sc, y_test))
print("Mean Absolute Error :", mean_absolute_error(y_test, y_rfr))



Decision Tree Model

In [None]:
# Create Decision Tree Classifier - scaled
dt_classifier = DecisionTreeClassifier()

# Train Decision Tree Classifier
dt_classifier.fit(X_train_sc, y_train)

# Predict using Decision Tree Classifier
y_dt = dt_classifier.predict(X_test_sc)

# Model Score
print("Decision Tree Classifier Score:", dt_classifier.score(X_test_sc, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_dt))


# Precision
precision = precision_score(y_test, y_dt, average='macro')
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_dt, average='macro')
print("Recall:", recall)

# F1 Score
f1 = f1_score(y_test, y_dt, average='macro')
print("F1 Score:", f1)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_dt)
print("Confusion Matrix:\n", conf_matrix)


SVM Model

In [None]:

# Create SVM Classifier
svm_classifier2 = SVC()

# Train SVM Classifier scaled
svm_classifier2.fit(X_train_sc, y_train)

# Predict using SVM Classifier
y_svm = svm_classifier2.predict(X_test_sc)

# Model Score
print("SVM Classifier Score:", svm_classifier2.score(X_test_sc, y_test))

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_svm))

# Precision
precision = precision_score(y_test, y_svm, average='macro')
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_svm, average='macro')
print("Recall:", recall)

# F1 Score
f1 = f1_score(y_test, y_svm, average='macro')
print("F1 Score:", f1)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_svm)
print("Confusion Matrix:\n", conf_matrix)


- With outliers removed using KMeans clustering, all models showed significantly improved performance.
- Decision Tree achieved perfect accuracy and F1 score, indicating it perfectly classified all instances.
- Random Forest also performed exceptionally well with a nearly perfect model score and negligible mean absolute error.
- Logistic Regression and SVM achieved high accuracy and precision, indicating robust performance in classification.
