# Welcome to My Notebook


# In this notebook we are going to predict the class of a date fruit with the help of various input features.

In [None]:
# import all the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score, recall_score, f1_score

import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the Date Fruit spreadsheet from the Kaggle input directory
dataframe = pd.read_excel(
    "/kaggle/input/date-fruit-datasets/Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx"
)
# Preview the first rows
dataframe.head()

In [None]:
# Summarise the columns of the dataset (dtypes and non-null counts)
dataframe.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the dataset
dataframe.describe()

In [None]:
# Count the rows that are exact duplicates of an earlier row
dataframe.duplicated().sum()

In [None]:
# Count the missing values in each column
dataframe.isna().sum()

In [None]:
# (number of rows, number of columns) of the dataset
dataframe.shape

In [None]:
# Pairwise correlation between the numeric feature columns.
# The non-numeric "Class" target is excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so selecting the numeric columns first keeps this cell working on
# every pandas version.
correlation_matrix = dataframe.select_dtypes(include="number").corr()
correlation_matrix

# Let's visualise the correlation matrix with the help of a heatmap

In [None]:
# Annotated heatmap of the correlation matrix, values shown to one decimal place
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
plt.show()

# Check whether the dataset is balanced

In [None]:
# Number of samples per target class
dataframe["Class"].value_counts()

# Exploratory Data Analysis

In [None]:
# Bar chart of the number of samples per target class
class_counts = dataframe["Class"].value_counts()
class_counts.plot.bar(figsize=(6, 4), rot=0, color="Blue")

# Let's see the data distribution of all the columns

In [None]:
# One 12-bin histogram per column, all drawn on a single 16x16 figure
dataframe.hist(bins=12, figsize=(16,16), grid=True)
# Shared title above the whole grid of histograms
plt.suptitle("Data Distribution of all the columns")
plt.show()

# Let's Detect Outliers Using Box Plots
The box plots below show outliers only for the following features: Area, Perimeter, Major_Axis, Minor_Axis, Solidity, Convex_Area, Eccentricity, Compactness and Roundness.

In [None]:
# One box plot per shape feature, grouped by target class, on a 3x3 grid.
# Each entry is (feature column, box colour, subplot title); the grid is filled
# row by row in this order.
boxplot_specs = [
    ("AREA", "c", "Class vs Area"),
    ("PERIMETER", "red", "Class vs Perimeter "),
    ("MAJOR_AXIS", "yellow", "Class vs Major_Axis"),
    ("MINOR_AXIS", "blue", "Class vs Minor_Axis"),
    ("SOLIDITY", "purple", "Class vs Solidity"),
    ("CONVEX_AREA", "violet", "Class vs Convex_Area"),
    ("ECCENTRICITY", "green", "Class vs Eccentricity"),
    ("COMPACTNESS", "grey", "Class vs Compactness"),
    ("ROUNDNESS", "orange", "Class vs Roundness"),
]

fig, axes = plt.subplots(3, 3, figsize=(19, 19))

# A single loop replaces nine copy-pasted stanzas, so a styling change only
# has to be made in one place.
for ax, (column, color, title) in zip(axes.flat, boxplot_specs):
    sns.boxplot(data=dataframe, x="Class", y=column, color=color, ax=ax)
    ax.set_title(title, fontsize=16)

plt.show()

# Drop the columns whose correlation with another feature is (approximately) 1 (multicollinearity)

In [None]:
# Feature columns removed before modelling.
redundant_columns = [
    "PERIMETER",
    "MAJOR_AXIS",
    "MINOR_AXIS",
    "CONVEX_AREA",
    "MeanRR",
    "ALLdaub4RR",
    "EntropyRG",
]
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
dataframe = dataframe.drop(columns=redundant_columns, errors="ignore")

# Divide the Dataset into Train and Test Set

In [None]:
def train_test_split_data(dataframe, target, test_size, random_state):
    """Split ``dataframe`` into stratified train and test partitions.

    Parameters
    ----------
    dataframe : DataFrame holding both the feature columns and the target.
    target : name of the target column; it is removed from the features.
    test_size : forwarded to ``train_test_split`` as ``test_size``.
    random_state : forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple ``(x_train, x_test, y_train, y_test)``; the split is stratified on
    the target column.
    """
    features = dataframe.drop([target], axis=1)
    labels = dataframe[target]
    splits = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return tuple(splits)

In [None]:
# 70/30 stratified split on the "Class" column with a fixed seed
x_train, x_test, y_train, y_test = train_test_split_data(
    dataframe=dataframe,
    target="Class",
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the four partitions, in the order train/test features, train/test target
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

# Get the Numerical and Categorical Columns list

In [None]:
def get_numerical_and_categorical_columns(dataframe):
    """Partition the column names of ``dataframe`` by dtype.

    Returns a pair ``(numerical_cols, categorical_cols)``: columns for which
    ``pd.api.types.is_numeric_dtype`` is true go into the first list, every
    other column into the second. Both lists keep the original column order.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    numerical_cols, categorical_cols = [], []
    for name in dataframe.columns:
        bucket = numerical_cols if is_numeric(dataframe[name]) else categorical_cols
        bucket.append(name)
    return numerical_cols, categorical_cols

In [None]:
 numerical_cols, categorical_cols=get_numerical_and_categorical_columns(dataframe)

# Let's detect the outliers in the training data and remove them

In [None]:
def Winsorization_Method(columns, x_train, y_train, a, b):
    """Remove rows whose value in any of ``columns`` lies outside a percentile band.

    For every column, the ``a``-th and ``b``-th percentiles of ``x_train[col]``
    are computed; a row is an outlier if, in at least one column, its value is
    below the lower or above the upper percentile. Despite the name, outlying
    rows are dropped (trimmed), not clipped to the bounds.

    Parameters
    ----------
    columns : iterable of column names of ``x_train`` to screen.
    x_train : pandas DataFrame of training features.
    y_train : pandas object with one entry per row of ``x_train``, in the same
        row order.
    a, b : lower and upper percentile, on the 0-100 scale used by
        ``np.percentile``.

    Returns
    -------
    (ratio, x_train, y_train) : ``ratio`` is the percentage of rows removed,
        rounded to two decimals; the other two are filtered copies. The inputs
        are not modified.
    """
    n_rows = len(x_train)
    if n_rows == 0:
        # Nothing to screen; also avoids dividing by zero in the ratio below.
        return 0.0, x_train.copy(), y_train.copy()

    # Positional boolean mask: True where the row is an outlier in any column.
    outlier_mask = np.zeros(n_rows, dtype=bool)
    for col in columns:
        values = x_train[col].to_numpy()
        lower, upper = np.percentile(values, [a, b])
        outlier_mask |= (values > upper) | (values < lower)

    ratio = round(int(outlier_mask.sum()) / n_rows * 100, 2)

    # Filtering by position keeps x and y aligned even if index labels repeat.
    keep = ~outlier_mask
    return ratio, x_train.loc[keep].copy(), y_train.loc[keep].copy()

In [None]:
# Screen the numeric columns of the training set with the 0.2 / 99.2 percentile bounds
ratio_of_outliers, x_train, y_train = Winsorization_Method(
    numerical_cols,
    x_train,
    y_train,
    a=0.2,
    b=99.2,
)

In [None]:
# Percentage of training rows flagged as outliers by Winsorization_Method
ratio_of_outliers

In [None]:
# Shapes of the training features and target after outlier removal
tuple(part.shape for part in (x_train, y_train))

# Data Preprocessing
1. All the features are numerical except the class, so let's scale the numerical features.
2. For the class (target) column, use label encoding.

# Numerical Feature Scaling (Using Robust Scaler)

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test features so no test data influences the fit.
robust_scaler= RobustScaler()
x_train=robust_scaler.fit_transform(x_train)
x_test=robust_scaler.transform(x_test)

# Let's do the label encoding on the target variable

In [None]:
# Learn the class-to-integer mapping from the training labels, then reuse the
# same mapping for the test labels.
le = LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

# Modelling

In [None]:
def modelling(x_train, x_test, y_train, y_test, average="macro", random_state=42):
    """Fit a fixed set of classifiers and score each one on the test split.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``model.fit``.
    x_test, y_test : held-out features and labels used for scoring.
    average : averaging mode forwarded to ``precision_score``, ``recall_score``
        and ``f1_score``. Defaults to ``"macro"``: with single-label multiclass
        targets, ``"micro"`` makes all three metrics equal to plain accuracy,
        so the three columns would be identical. Pass ``average="micro"`` to
        reproduce that behaviour.
    random_state : seed given to the estimators that accept one here
        (decision tree, random forest, XGBoost) so repeated runs give the
        same table.

    Returns
    -------
    DataFrame indexed by estimator class name with the columns ``Recall``,
    ``Precision`` and ``F1_Score``, sorted by ``Recall`` in descending order.
    """
    models = [
        LogisticRegression(),
        GaussianNB(),
        SVC(kernel="linear"),
        KNeighborsClassifier(n_neighbors=32),
        DecisionTreeClassifier(criterion="gini", random_state=random_state),
        RandomForestClassifier(n_estimators=200, criterion="gini", random_state=random_state),
        XGBClassifier(random_state=random_state),
    ]

    model_names = []
    score_rows = []
    for model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        # Derive the row label from the estimator itself so the labels cannot
        # drift out of sync with the model list.
        model_names.append(type(model).__name__)
        score_rows.append(
            {
                "Recall": recall_score(y_test, y_pred, average=average),
                "Precision": precision_score(y_test, y_pred, average=average),
                "F1_Score": f1_score(y_test, y_pred, average=average),
            }
        )

    result_df = pd.DataFrame(
        score_rows,
        index=model_names,
        columns=["Recall", "Precision", "F1_Score"],
    )
    return result_df.sort_values(by="Recall", ascending=False)

In [None]:
# Train and score every model, then display the resulting metrics table
result_df= modelling(x_train, x_test, y_train, y_test)
result_df

# Let's visualize the results

In [None]:
# Horizontal bar chart of the metrics per model; the legend is moved outside the axes
ax = result_df.plot.barh(figsize=(10, 7), grid=True)
ax.legend(bbox_to_anchor=(1.2, 1));

🚀 Hi Kagglers,

I hope you enjoyed exploring my notebook! If you found the work insightful or helpful, I kindly invite you to show your support by giving it an upvote. Your appreciation fuels my motivation to continue sharing valuable insights with the community.

Moreover, I believe in continuous improvement, and your feedback plays a crucial role in making my work even better. If you have any suggestions, comments, or thoughts, please don't hesitate to leave them in the comments section. Let's learn and grow together!

Thank you for being a part of this amazing journey. Here's to more exciting collaborations and knowledge sharing ahead. 🌟