In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

dataset = pd.read_csv('riceclass.csv')
X = dataset.drop('Class', axis=1)
y = dataset['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')

print("Metrics before hyperparameter optimization:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Metrics before hyperparameter optimization:
Accuracy: 0.9908136482939632
Precision: 0.9908835502233582
Recall: 0.9908136482939632


In [25]:
# Regularization strength for hyperparameter tuning using grid search. Higher C values, more regularization in the data, but may overfit at higher values of C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_tuned = best_model.predict(X_test_scaled)

accuracy_tuned = metrics.accuracy_score(y_test, y_pred_tuned)
precision_tuned = metrics.precision_score(y_test, y_pred_tuned, average='weighted')
recall_tuned = metrics.recall_score(y_test, y_pred_tuned, average='weighted')

print("\nMetrics after hyperparameter optimization:")
print(f"Accuracy: {accuracy_tuned}")
print(f"Precision: {precision_tuned}")
print(f"Recall: {recall_tuned}")



Metrics after hyperparameter optimization:
Accuracy: 0.9986876640419947
Precision: 0.9986906948409741
Recall: 0.9986876640419947


In [28]:
# Feature engineering part, doing polynomial features
for degree in [1, 2, 3, 4, 5, 6]:
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=21)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    #Positive Label is Osmancik for recall and F1. 
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='Osmancik')
    recall = recall_score(y_test, y_pred, pos_label='Osmancik')
    f1 = f1_score(y_test, y_pred, pos_label='Osmancik')
    cm = confusion_matrix(y_test, y_pred, labels=['Cammeo', 'Osmancik'])

    print(f"Degree: {degree}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("\n")


Degree: 1
Accuracy: 0.9908
Precision: 0.9863
Recall: 0.9977
F1 Score: 0.9919
Confusion Matrix:
[[324   6]
 [  1 431]]


Degree: 2
Accuracy: 0.9934
Precision: 0.9886
Recall: 1.0000
F1 Score: 0.9942
Confusion Matrix:
[[325   5]
 [  0 432]]


Degree: 3
Accuracy: 0.9961
Precision: 0.9931
Recall: 1.0000
F1 Score: 0.9965
Confusion Matrix:
[[327   3]
 [  0 432]]


Degree: 4
Accuracy: 0.9974
Precision: 0.9954
Recall: 1.0000
F1 Score: 0.9977
Confusion Matrix:
[[328   2]
 [  0 432]]


Degree: 5
Accuracy: 0.9974
Precision: 0.9954
Recall: 1.0000
F1 Score: 0.9977
Confusion Matrix:
[[328   2]
 [  0 432]]


Degree: 6
Accuracy: 0.9987
Precision: 0.9977
Recall: 1.0000
F1 Score: 0.9988
Confusion Matrix:
[[329   1]
 [  0 432]]


