# Models
just seeing how well the features dataset does (no images yet)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, cross_val_score


In [11]:
features_df = pd.read_csv('images.csv')

X = features_df.drop(columns=['Image_Name', 'Target', 'Path'])
y = features_df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True) 
# test_size=0.4 decreases training accuracy for log reg a little

## Logistic Regression


In [12]:
model = LogisticRegression(max_iter=1000, C=0.1)  # increase max_iter if the model struggles to converge
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        46

    accuracy                           1.00        97
   macro avg       1.00      1.00      1.00        97
weighted avg       1.00      1.00      1.00        97

Confusion Matrix:
 [[51  0]
 [ 0 46]]


In [13]:
y_train_pred = model.predict(X_train)
accuracy_score(y_train, y_train_pred) # high training error -> overfitting

1.0

In [14]:
precision, recall, thresholds = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])
print(precision, recall, thresholds)

[0.4742268  0.47916667 0.48421053 0.4893617  0.49462366 0.5
 0.50549451 0.51111111 0.51685393 0.52272727 0.52873563 0.53488372
 0.54117647 0.54761905 0.55421687 0.56097561 0.56790123 0.575
 0.58227848 0.58974359 0.5974026  0.60526316 0.61333333 0.62162162
 0.63013699 0.63888889 0.64788732 0.65714286 0.66666667 0.67647059
 0.68656716 0.6969697  0.70769231 0.71875    0.73015873 0.74193548
 0.75409836 0.76666667 0.77966102 0.79310345 0.80701754 0.82142857
 0.83636364 0.85185185 0.86792453 0.88461538 0.90196078 0.92
 0.93877551 0.95833333 0.9787234  1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.        ] [1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         

In [15]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=stratified_kfold)
print("Stratified cross-validation scores:", scores)

Stratified cross-validation scores: [1.         0.98969072 0.98969072 1.         1.        ]


## Multi-Layer Perceptron (MLP)

In [16]:
model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42) # 1 hidden layer, 10 neurons
model.fit(X_train, y_train)



In [17]:
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9381443298969072


In [18]:
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9072164948453608


In [19]:
from sklearn.model_selection import GridSearchCV

model = MLPClassifier(max_iter=1000, random_state=42)

param_grid = {
    'hidden_layer_sizes': [(50, 30), (100,), (200,), (100, 50)],  # Number of neurons in each layer
    'activation': ['relu', 'logistic', 'tanh'],  # Activation functions
    'learning_rate_init': [0.001, 0.01, 0.1],  # Learning rates
    'solver': ['adam', 'sgd'],  # Optimizer (Adam or SGD)
    'alpha': [0.0001, 0.001, 0.01]  # Regularization parameter
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits




Best Parameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 30), 'learning_rate_init': 0.01, 'solver': 'adam'}
Best Cross-Validation Score: 0.975257731958763


now i try training model with these params
- also scale since i forgo

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_feat_scaled = scaler.fit_transform(X_train)
X_test_feat_scaled = scaler.transform(X_test)

In [21]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(
    activation='relu', 
    alpha=0.001, 
    hidden_layer_sizes=(50, 30), 
    learning_rate_init=0.01, 
    solver='adam', 
    max_iter=1000, 
    random_state=42
) # best features

model.fit(X_train_feat_scaled, y_train)

train_score = model.score(X_train_feat_scaled, y_train)
test_score = model.score(X_test_feat_scaled, y_test)

print(f"Training Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.9897


In [22]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test_feat_scaled)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       1.00      0.98      0.99        46

    accuracy                           0.99        97
   macro avg       0.99      0.99      0.99        97
weighted avg       0.99      0.99      0.99        97

[[51  0]
 [ 1 45]]


## use pipeline to scale MLP 
- also try gridsearch 

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Scale features
    ('mlp', MLPClassifier(
        activation='relu',
        alpha=0.001,
        hidden_layer_sizes=(50, 30),
        learning_rate_init=0.01,
        solver='adam',
        max_iter=1000,
        random_state=42
    ))  # Step 2: Train MLP
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
mlp_predictions = pipeline.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predict on the test set
mlp_predictions = pipeline.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, mlp_predictions)
print(f"Accuracy: {accuracy:.4f}")

# Calculate Precision, Recall, and F1-score (for binary classification)
precision = precision_score(y_test, mlp_predictions, average='binary')  # Use average='micro' or 'macro' for multiclass
recall = recall_score(y_test, mlp_predictions, average='binary')      # Use average='micro' or 'macro' for multiclass
f1 = f1_score(y_test, mlp_predictions, average='binary')              # Use average='micro' or 'macro' for multiclass
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, mlp_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9897
Precision: 1.0000
Recall: 0.9783
F1 Score: 0.9890
Confusion Matrix:
[[51  0]
 [ 1 45]]


In [25]:
# Accuracy on training set
train_accuracy = pipeline.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Accuracy on test set
test_accuracy = pipeline.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Training Accuracy: 1.0000
Test Accuracy: 0.9897


In [27]:
from sklearn.model_selection import cross_val_score
import numpy as np

cross_val_scores = cross_val_score(pipeline, X, y, cv=5)  # 5-fold cross-validation
print(f"Cross-Validated Accuracy: {np.mean(cross_val_scores):.4f}")


Cross-Validated Accuracy: 0.9526


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Scale features
    ('mlp', MLPClassifier(random_state=42))  # Placeholder for MLP
])

# Define hyperparameter grid for the MLPClassifier
param_grid = {
    'mlp__activation': ['relu', 'tanh'],  # Activation function
    'mlp__alpha': [0.0001, 0.001, 0.01, 0.1],  # Regularization strength
    'mlp__hidden_layer_sizes': [(50, 30), (100, 50), (100, 100)],  # Number of layers and units
    'mlp__learning_rate_init': [0.001, 0.01, 0.1],  # Initial learning rate
    'mlp__solver': ['adam', 'sgd'],  # Optimization method
    'mlp__max_iter': [500, 1000]  # Number of iterations
}

# Define GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate on the test set
test_accuracy = grid_search.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")




Best Parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100, 50), 'mlp__learning_rate_init': 0.01, 'mlp__max_iter': 500, 'mlp__solver': 'adam'}
Best Cross-Validation Accuracy: 0.9820
Test Accuracy: 1.0000


Best Parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100, 50), 'mlp__learning_rate_init': 0.01, 'mlp__max_iter': 500, 'mlp__solver': 'adam'}
Best Cross-Validation Accuracy: 0.9820
Test Accuracy: 1.0000