# Set up and Preprocessing

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

### Read the dataset

In [62]:
data = pd.read_excel('fruit_data.xlsx')

### Drop unnecessary columns

In [63]:
data.drop(columns=['Unnamed: 0'], inplace=True)

### Clean categorical data

In [64]:
size_replacements = {'Largee': 'Large'}
color_replacements = {'Yellow1': 'Yellow'}

data['size'].replace(size_replacements, inplace=True)
data['color'].replace(color_replacements, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['size'].replace(size_replacements, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['color'].replace(color_replacements, inplace=True)


### Understanding the size of the dataset

In [66]:
data.shape

(200, 7)

### Understanding the uniqueness of the data

In [67]:
data.nunique()

fruit_type             3
color                  8
size                   4
weight                81
fruit_type_encoded     3
color_encoded          8
size_encoded           4
dtype: int64

### Finding missing values

In [68]:
data.isnull().sum()

fruit_type            0
color                 0
size                  0
weight                0
fruit_type_encoded    0
color_encoded         0
size_encoded          0
dtype: int64

### Description of dataset

In [69]:
data.describe()

Unnamed: 0,weight,fruit_type_encoded,color_encoded,size_encoded
count,200.0,200.0,200.0,200.0
mean,59.04722,0.98,3.97,1.175
std,46.697739,0.826447,2.346622,1.029404
min,1.0,0.0,0.0,0.0
25%,8.142752,0.0,2.0,0.0
50%,63.109415,1.0,4.0,1.0
75%,94.367815,2.0,6.0,2.0
max,250.0,2.0,7.0,3.0


### Encode categorical variables

In [65]:

le_fruit = LabelEncoder()
le_color = LabelEncoder()
le_size = LabelEncoder()

data['fruit_type_encoded'] = le_fruit.fit_transform(data['fruit_type'])
data['color_encoded'] = le_color.fit_transform(data['color'])
data['size_encoded'] = le_size.fit_transform(data['size'])

# Model Training

### Prepare features X and target y

In [None]:
X = data[['color_encoded', 'size_encoded', 'weight']]
y = data['fruit_type_encoded']

### Split the data into training and testing sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)