In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Step 1: Read the avocado measurements from CSV into a DataFrame
avocado_data = pd.read_csv(filepath_or_buffer='avocado.csv')
# Preview the first five rows as the cell output
avocado_data.head(n=5)

Unnamed: 0,firmness,hue,saturation,brightness,color_category,sound_db,weight_g,size_cm3,ripeness
0,14.5,19,40,26,black,34,175,261,ripe
1,71.7,53,69,75,green,69,206,185,pre-conditioned
2,88.5,60,94,46,dark green,79,220,143,hard
3,93.8,105,87,41,dark green,75,299,140,hard
4,42.5,303,58,32,purple,63,200,227,breaking


In [30]:
# Step 2: Split into features and label
# The label is the 'ripeness' column; everything else is a candidate feature.
Y = avocado_data['ripeness']

# 'Unnamed: 0' is the row index that was written into the CSV, not a
# measurement. If it stays in X it is picked up as a numeric feature in Step 4,
# and the fitted ColumnTransformer then rejects new samples (Step 10) that do
# not carry that column. errors='ignore' keeps this cell working if the CSV is
# ever loaded without that column (e.g. with index_col=0).
X = (
    avocado_data
    .drop(columns=['ripeness'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
X.head()

Unnamed: 0,firmness,hue,saturation,brightness,color_category,sound_db,weight_g,size_cm3
0,14.5,19,40,26,black,34,175,261
1,71.7,53,69,75,green,69,206,185
2,88.5,60,94,46,dark green,79,220,143
3,93.8,105,87,41,dark green,75,299,140
4,42.5,303,58,32,purple,63,200,227


In [15]:
# Step 3: Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Step 4: Identify numeric and categorical columns
# The categorical columns are listed by hand; every other training column is
# treated as numeric, in its original column order.
categorical_cols = ['color_category']
numerical_cols = X_train.columns[~X_train.columns.isin(categorical_cols)].tolist()

In [None]:
# Step 5: Preprocessing pipeline
# Numeric columns are standardised; the categorical column is one-hot encoded
# with its first level dropped.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [25]:
# Step 6: Apply transformations
# Fit the preprocessor on the training split only, then push both splits
# through the fitted transformer.
preprocessor.fit(X_train)
X_train_scaled = preprocessor.transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [26]:
# Step 7: Create the model
# A logistic-regression classifier built with no arguments, i.e. with
# scikit-learn's default hyperparameters.
model = LogisticRegression()

In [28]:
# Step 8: Train the model
# Fit the classifier on the preprocessed training features and their labels.
model.fit(X_train_scaled, Y_train)

In [29]:
# Step 9: Score the fitted model on the held-out test split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [36]:
# Step 10: Sample new data
# Two hand-written avocados, one row each, in the same feature order as the
# training frame.
new_data_np = np.array([
    [88.0, 65, 78, 48, 'dark green', 77, 255, 185],  # Sample 1
    [30.5, 290, 50, 33, 'purple', 59, 175, 155]      # Sample 2
], dtype=object)

original_feature_order = ['firmness', 'hue', 'saturation', 'brightness', 'color_category', 'sound_db', 'weight_g', 'size_cm3']

# The array has dtype=object (it mixes numbers and strings), so a DataFrame
# built from it stores every column as object. infer_objects() restores proper
# numeric dtypes for the measurement columns, so the preprocessor receives
# numeric columns rather than relying on implicit coercion of object values.
new_data_df = pd.DataFrame(new_data_np, columns=original_feature_order).infer_objects()
print(new_data_df)

# Transform new data with the preprocessor that was fitted on the training split
new_data_scaled = preprocessor.transform(new_data_df)

# Predict ripeness for both samples
predictions = model.predict(new_data_scaled)


  firmness  hue saturation brightness color_category sound_db weight_g  \
0     88.0   65         78         48     dark green       77      255   
1     30.5  290         50         33         purple       59      175   

  size_cm3  
0      185  
1      155  


In [37]:
# Step 11: Report the predicted class of each sample, numbered from 1
for sample_number, predicted_label in zip(range(1, len(predictions) + 1), predictions):
    print(f"Avocado Sample {sample_number} → Predicted Ripeness: {predicted_label}")

Avocado Sample 1 → Predicted Ripeness: hard
Avocado Sample 2 → Predicted Ripeness: breaking
