In [20]:
from pathlib import Path
import sys
import pandas as pd
# ensure project root is on sys.path so `scripts` is importable
sys.path.append(str(Path('..').resolve()))
from scripts.image_to_csv import process_dataset
from scripts.split_data import split_csv_streaming
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [25]:
dataset_path = '../assets/image_dataset/tomato_objects_hog.csv'
# process_dataset(dataset_path) # if you want to run it again remove the comment
# the comment added to prevent re-processing the dataset since it takes alot of time

In [None]:
df = pd.read_csv('../assets/image_dataset/tomato_objects_hog.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63956 entries, 0 to 63955
Columns: 1765 entries, f0 to label
dtypes: float64(1764), int64(1)
memory usage: 861.2 MB


In [16]:
df.isnull().all().sum()

np.int64(0)

In [None]:
split_csv_streaming()

In [3]:
# Loading the CSV into a DataFrame
train_df = pd.read_csv('../assets/image_dataset/train_data.csv')

In [4]:
# Separate Features (X) and Labels (y)
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

In [51]:
test_df = pd.read_csv('../assets/image_dataset/test_data.csv')
    
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

In [52]:
# max_iter=3000 ensures the model has enough time to find the best math
# solver='lbfgs' is the standard, efficient solver
model = LogisticRegression(max_iter=3000, solver='lbfgs', verbose=1)
    
model.fit(X_train, y_train)
print("   Training complete.")

   Training complete.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.9s finished


In [53]:
y_pred = model.predict(X_test)

In [54]:
# Calculate Accuracy
acc = accuracy_score(y_test, y_pred)

print(f"FINAL ACCURACY: {acc * 100:.2f}%")

FINAL ACCURACY: 78.70%


In [57]:
# clasification report
report = classification_report(y_test, y_pred, target_names=['Fresh (0)', 'Rotten (1)'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

   Fresh (0)       0.20      0.00      0.00      2722
  Rotten (1)       0.79      1.00      0.88     10074

    accuracy                           0.79     12796
   macro avg       0.49      0.50      0.44     12796
weighted avg       0.66      0.79      0.69     12796



#### K-means algo

In [5]:
# train K-means algo
kmeans = KMeans(
        n_clusters=2, 
        random_state=42, 
        n_init='auto',
        max_iter=300
    )

y_true = train_df['label']
kmeans.fit(X_train)

0,1,2
,n_clusters,2
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [7]:
# Get the assigned cluster IDs (0 or 1)
generated_labels = kmeans.labels_
generated_labels

array([1, 0, 1, ..., 1, 0, 1], shape=(51160,), dtype=int32)

In [8]:
# Create a Comparison Table (Confusion Matrix)
# This shows how many Fresh/Rotten tomatoes ended up in Cluster 0 vs Cluster 1
comparison = pd.crosstab(
    y_true, 
    generated_labels, 
    rownames=['Actual_Label'], 
    colnames=['KMeans_Cluster']
)
    
# Rename the index for clarity in the print output
comparison.index = ['Fresh (0)', 'Rotten (1)']
    
print(comparison)

KMeans_Cluster      0      1
Fresh (0)        5320   5555
Rotten (1)      19906  20379


In [13]:
try:
    sample_X = X_train.sample(10000, random_state=42)
    sample_preds = kmeans.predict(sample_X)
    score = silhouette_score(sample_X, sample_preds)
    print(f"Silhouette Score: {score:.4f}")
except ValueError:
    print("Not enough data to calculate score.")

Silhouette Score: 0.0634


In [14]:
acc_A = accuracy_score(y_true, generated_labels)
y_kmeans_pred_flipped = 1 - generated_labels
acc_B = accuracy_score(y_true, y_kmeans_pred_flipped)

# The real accuracy is the maximum of the two
final_accuracy = max(acc_A, acc_B)
print(f"K-Means Clustering Accuracy: {final_accuracy * 100:.2f}%")

K-Means Clustering Accuracy: 50.23%


In [16]:
kmeans_train_df = train_df.copy()

In [19]:
X = kmeans_train_df.drop('label', axis=1)
y_true = kmeans_train_df['label']

In [22]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),       # Make numbers comparable
    ('pca', PCA(n_components=50)),      # Reduce 1700 cols -> 50 cols
    ('kmeans', KMeans(n_clusters=2, random_state=42, n_init=10))
])

# Train
pipeline.fit(X)

0,1,2
,steps,"[('scaler', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,50
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,n_clusters,2
,init,'k-means++'
,n_init,10
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [26]:
# We need to access the 'kmeans' step of the pipeline to get labels
y_pred = pipeline.named_steps['kmeans'].labels_

# Calculate "Smart" Accuracy
acc_A = accuracy_score(y_true, y_pred)
acc_B = accuracy_score(y_true, 1 - y_pred) # Flip labels

final_acc = max(acc_A, acc_B)

print(f"Improved Accuracy (with PCA): {final_acc * 100:.2f}%")


Improved Accuracy (with PCA): 50.60%
