In [None]:
#save images as HOG features
import os
import pandas as pd
from skimage.feature import hog
from skimage.io import imread
from skimage.color import rgb2gray

def extract_hog_features(image_folder, output_csv, 
                         orientations=9, pixels_per_cell=(8, 8), 
                         cells_per_block=(2, 2), filter_keyword="_grayscale"):
    features = []
    image_names = []
    
    for image_name in os.listdir(image_folder):
        if filter_keyword in image_name:
            image_path = os.path.join(image_folder, image_name)
            try:
                image = imread(image_path)
                
                if len(image.shape) == 3:
                    image = rgb2gray(image)
                    
                hog_features = hog(image, 
                                   orientations=orientations, 
                                   pixels_per_cell=pixels_per_cell, 
                                   cells_per_block=cells_per_block, 
                                   block_norm='L2-Hys', 
                                   visualize=False)
                
                features.append(hog_features)
                image_names.append(image_name)
                print(f"Processed: {image_name}")
            except Exception as e:
                print(f"Error processing '{image_name}': {e}")

    df = pd.DataFrame(features)
    df.insert(0, 'image_name', image_names)
    
    df.to_csv(output_csv, index=False)
    print(f"HOG features saved to {output_csv}")

input_folder = 'train_new_ims' 
output_file = 'hog_features_grayscale.csv' 

extract_hog_features(image_folder=input_folder, output_csv=output_file, filter_keyword="_grayscale")


In [None]:
#extract and save histogram features
import os
import pandas as pd
import cv2
import numpy as np

def extract_histogram_features(image_folder, output_csv, bins=8, exclude_keyword="_grayscale"):
    histogram_features = []
    image_names = []

    for image_name in os.listdir(image_folder):
        # Process files that do NOT contain "grey_scale"
        if exclude_keyword not in image_name:
            image_path = os.path.join(image_folder, image_name)
            try:
                image = cv2.imread(image_path)

                if image is None:
                    raise ValueError("Image not loaded correctly.")

                # Convert the image to HSV color space, we can try RGB as well
                hsv_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

                # Compute the color histogram for H, S, and V channels
                hist_h = cv2.calcHist([hsv_image], [0], None, [bins], [0, 180])
                hist_s = cv2.calcHist([hsv_image], [1], None, [bins], [0, 256])
                hist_v = cv2.calcHist([hsv_image], [2], None, [bins], [0, 256])

                # Normalize the histograms
                hist_h = hist_h / hist_h.sum()
                hist_s = hist_s / hist_s.sum()
                hist_v = hist_v / hist_v.sum()

                hist_features = np.concatenate((hist_h.flatten(), hist_s.flatten(), hist_v.flatten()))

                histogram_features.append(hist_features)
                image_names.append(image_name)
                print(f"Processed: {image_name}")
            except Exception as e:
                print(f"Error processing {image_name}: {e}")

    # Save to CSV
    hist_df = pd.DataFrame(histogram_features)
    hist_df.insert(0, 'image_name', image_names)
    hist_df.to_csv(output_csv, index=False)
    print(f"Histogram features saved to {output_csv}")

input_folder = 'train_new_ims' 
output_file = 'histogram_features_color.csv' 

extract_histogram_features(image_folder=input_folder, output_csv=output_file, bins=8, exclude_keyword="_grayscale")


In [4]:
import pandas as pd

def combine_hog_and_histogram_with_hist_image_name(hog_csv, hist_csv, output_csv):
    hog_df = pd.read_csv(hog_csv)
    hist_df = pd.read_csv(hist_csv)

    hog_df['image_code'] = hog_df['image_name'].str.split('_').str[0] #get the first part of their name
    hist_df['image_code'] = hist_df['image_name'].str.split('_').str[0] #get the first part of their name

    combined_df = pd.merge(hog_df, hist_df, on='image_code', how='inner', suffixes=('_hog', '_hist'))

    combined_df['image_name'] = combined_df['image_name_hist'] #use the histogram image name
    combined_df = combined_df.drop(columns=['image_name_hog', 'image_name_hist'])

    cols = ['image_name'] + [col for col in combined_df.columns if col != 'image_name']
    combined_df = combined_df[cols]

    # Save the combined dataframe to a CSV file
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined features saved to {output_csv}")
    return combined_df

hog_csv = 'hog_features_grayscale.csv'
hist_csv = 'histogram_features_color.csv'
output_csv = 'combined_features.csv'

combined_features_df = combine_hog_and_histogram_with_hist_image_name(hog_csv, hist_csv, output_csv)



In [5]:
combined_df = pd.read_csv('combined_features.csv')
print(combined_df.columns)
print(combined_df.head(2))

hog_csv = pd.read_csv('hog_features_grayscale.csv')
hist_csv = pd.read_csv('histogram_features_color.csv')
print("_____________")
print(hog_csv.columns)
print(hog_csv.head(5))
print("_____________")
print(hist_csv.columns)
print(hist_csv.head(5))

Index(['image_name', '0_hog', '1_hog', '2_hog', '3_hog', '4_hog', '5_hog',
       '6_hog', '7_hog', '8_hog',
       ...
       '14_hist', '15_hist', '16_hist', '17_hist', '18_hist', '19_hist',
       '20_hist', '21_hist', '22_hist', '23_hist'],
      dtype='object', length=350)
              image_name     0_hog     1_hog     2_hog     3_hog     4_hog  \
0  9e1d819_augmented.jpg  0.139699  0.079391  0.038862  0.090377  0.253900   
1  9e1d819_augmented.jpg  0.165667  0.255557  0.058470  0.104116  0.095903   

      5_hog     6_hog     7_hog     8_hog  ...   14_hist   15_hist   16_hist  \
0  0.217409  0.253900  0.069572  0.083052  ...  0.000977  0.000977  0.181641   
1  0.039462  0.015208  0.037321  0.027881  ...  0.000977  0.000977  0.181641   

    17_hist   18_hist   19_hist   20_hist   21_hist   22_hist   23_hist  
0  0.112305  0.143555  0.182617  0.166992  0.108398  0.083984  0.020508  
1  0.112305  0.143555  0.182617  0.166992  0.108398  0.083984  0.020508  

[2 rows x 350 columns]

In [6]:
print(pd.read_csv("hog_features_grayscale.csv").isna().sum().sum())
print(pd.read_csv("histogram_features_color.csv").isna().sum().sum())


0
0


In [7]:
# Combined features
combined_df = pd.read_csv("combined_features.csv")
combined_df['image_code'] = combined_df['image_name'].str.split('_').str[0]
print("Combined Features Image Codes:")
print(combined_df['image_code'].head())

# Training labels
train_df = pd.read_csv("train.csv")
train_df['image_code'] = train_df['im_name'].str.split('.').str[0]
print("\nTraining Labels Image Codes:")
print(train_df['image_code'].head())


Combined Features Image Codes:
0    9e1d819
1    9e1d819
2    7db1b9e
3    7db1b9e
4    b877c16
Name: image_code, dtype: object

Training Labels Image Codes:
0    00016cd
1    0001808
2    0002399
3    0003973
4    00061cc
Name: image_code, dtype: object


In [8]:
def load_combined_features_for_training(combined_csv, train_csv):

    combined_df = pd.read_csv(combined_csv)
    # Load the labels
    train_df = pd.read_csv(train_csv)

    # Extract first part of image name for matching
    combined_df['image_code'] = combined_df['image_name'].str.split('_').str[0]
    train_df['image_code'] = train_df['im_name'].str.split('.').str[0]

    # Merge combined features with labels
    matched_df = pd.merge(combined_df, train_df, on='image_code', how='inner')

    X = matched_df.drop(columns=['image_name', 'image_code', 'im_name', 'label']).values
    y = matched_df['label'].values

    print(f"Matched {len(matched_df)} features with labels.")
    return X, y

X, y = load_combined_features_for_training("combined_features.csv", "train.csv")


Matched 100000 features with labels.


In [9]:
print(X.shape, y.shape)


(100000, 348) (100000,)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [11]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(80000, 348) (20000, 348) (80000,) (20000,)


In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

clf = SGDClassifier(loss='log_loss', max_iter=10000, tol=1e-3, random_state=42)
#log_loss = logistic regression, hinge = linear SVM, 
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.4715


In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score

param_grid = {
    'loss': ['hinge', 'log_loss'],  
    'penalty': ['l2', 'elasticnet'],  
    'alpha': [1e-4, 1e-3],  
    'learning_rate': ['constant', 'adaptive'],  
    'eta0': [0.01, 0.1], 
    'max_iter': [500],  
}

grid = ParameterGrid(param_grid)

best_params = None
best_accuracy = 0

for params in grid:
    print(f"Testing parameters: {params}")
    clf = SGDClassifier(random_state=42, **params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)  
    acc = accuracy_score(y_test, y_pred)  

    print(f"Accuracy: {acc:.4f}")
    if acc > best_accuracy:
        best_accuracy = acc
        best_params = params

print(f"\nBest Parameters: {best_params}")
print(f"Best Test Accuracy: {best_accuracy:.4f}")


Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2'}
Accuracy: 0.4099
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'elasticnet'}
Accuracy: 0.4137
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'log_loss', 'max_iter': 500, 'penalty': 'l2'}
Accuracy: 0.4692
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'log_loss', 'max_iter': 500, 'penalty': 'elasticnet'}
Accuracy: 0.4688
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2'}
Accuracy: 0.4322
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'adaptive', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'elasticnet'}
Accuracy: 0.4226
Testing parameters: {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'adapti