For a SIFT + Bag of Visual Words + Classifier pipeline, we follow these steps:
1. Generate SIFT descriptors for each image.
2. Cluster the SIFT descriptors to get visual vocabulary. Cluster centers are the vocabulary.
3. Use the clustering model to predict cluster labels for each descriptor for each image. 
4. Get normalized histograms of the cluster labels for each image. This gives us a normalized count of the number of visual words that are present in the image. 
5. Use the histogram and the labels to build a classifier.

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from src.data_utils.dataset import Dataset
from src.config.config import RANDOM_STATE
# from src.clustering.cluster import run_clustering_pipeline
from scipy.spatial.distance import cdist
from scipy.linalg import norm
from src.classification.classify import run_classification_pipeline

# Clustering

In [2]:
train_ds = Dataset("train")
train_descriptors, train_suits, train_nums = train_ds.load_descriptors()

In [3]:
# Using dataframes to leverage groupbys
train_df = pd.DataFrame(
    {
        "suits": [arr[0, 0].astype("str") for arr in train_suits], 
        "numbers": [arr[0, 0].astype("str") for arr in train_nums]
    }
)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7509 entries, 0 to 7508
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   suits    7509 non-null   object
 1   numbers  7509 non-null   object
dtypes: object(2)
memory usage: 117.5+ KB


## Modelling suits

In [None]:
train_df["suits"].value_counts()

suits
spades      2031
hearts      1845
diamonds    1827
clubs       1806
Name: count, dtype: int64

Since clubs have the lowest count, we take 1806 samples from each class. 

In [5]:
num_samples_per_class = 1806
train_mask = train_df.groupby(
    "suits"
).sample(
    num_samples_per_class, 
    random_state=RANDOM_STATE
).index.values

In [6]:
train_descriptors_filtered = [train_descriptors[i] for i in train_mask]
train_suits_filtered = [train_suits[i] for i in train_mask]

In [7]:
normalized_descriptors = np.vstack(train_descriptors_filtered)
normalized_descriptors = normalized_descriptors / np.linalg.norm(normalized_descriptors, axis=1, keepdims=True)

In [8]:
# kmeans_models = []
# training_metrics = []

# for k in range(100, 501, 50):
#     pipeline = Pipeline(
#         [
#             ("kmeans", MiniBatchKMeans(n_clusters=k, random_state=RANDOM_STATE))
#         ]
#     )

#     model, metrics =  run_clustering_pipeline(
#         np.vstack(train_descriptors_filtered), 
#         np.vstack(train_suits_filtered), 
#         pipeline, 
#         model_params=None, 
#         experiment_name="clustering"
#     )

#     kmeans_models.append(model)
#     training_metrics.append(metrics)

To improve model training speeds, a cuml KMeans model with 40 clusters was trained on a cloud GPU. This model was then converted into an sklearn KMeans model by manually assigning the cluster centers from the trained cuml model to an instance of an sklearn KMeans model, to use on a CPU. 

After getting the cluster centers, we now create the histogram for each image. 

In [8]:
kmeans_model = joblib.load("../models_v2/kmeans_classifier.pkl")
kmeans_model

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
cluster_centers = kmeans_model.cluster_centers_
cluster_centers.shape

(40, 128)

In [10]:
def compute_bovw_histogram(descriptors, cluster_centers):
    # Using cdist instead of kmeans.predict for improved speed. 
    distances = cdist(descriptors, cluster_centers, metric="euclidean")
    labels = np.argmin(distances, axis=1)

    # Labels will have the same length as number of descriptors, 1 for 
    # each descriptor of the image. 
    hist, _ = np.histogram(labels, bins=np.arange(len(cluster_centers)))
    normalized_hist = hist / norm(hist)

    return normalized_hist

In [11]:
val_ds = Dataset("val")
val_descriptors, val_suits, val_nums = val_ds.load_descriptors()

In [12]:
X_train_histograms = [compute_bovw_histogram(desc_array, cluster_centers) for desc_array in train_descriptors_filtered]
X_train_labels = [suit_array[0][0].decode() for suit_array in train_suits_filtered]    
X_train = np.array(X_train_histograms)
y_train = np.array(X_train_labels).reshape(-1, 1)

In [13]:
X_val_histograms = [compute_bovw_histogram(desc_array, cluster_centers) for desc_array in val_descriptors]
X_val_labels = [suit_array[0][0].decode() for suit_array in val_suits]    
X_val = np.array(X_val_histograms)
y_val = np.array(X_val_labels).reshape(-1, 1)

X_val.shape, y_val.shape

((260, 39), (260, 1))

In [17]:
pipeline = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", LinearSVC(random_state=RANDOM_STATE))     
])

param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100], 
    "classifier__loss": ["hinge", "squared_hinge"]
}

grid_search = GridSearchCV(
    pipeline, 
    param_grid=param_grid, 
    cv=5, 
    scoring="accuracy", 
    verbose=2
)

In [18]:
model1, metrics1 = run_classification_pipeline(
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    grid_search, 
    model_params=None, 
    experiment_name="classification"
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........classifier__C=0.01, classifier__loss=hinge; total time=   0.2s
[CV] END .........classifier__C=0.01, classifier__loss=hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........classifier__C=0.01, classifier__loss=hinge; total time=   0.1s
[CV] END .........classifier__C=0.01, classifier__loss=hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........classifier__C=0.01, classifier__loss=hinge; total time=   0.1s
[CV] END .classifier__C=0.01, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .classifier__C=0.01, classifier__loss=squared_hinge; total time=   0.2s
[CV] END .classifier__C=0.01, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END .classifier__C=0.01, classifier__loss=squared_hinge; total time=   0.2s
[CV] END .classifier__C=0.01, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=0.1, classifier__loss=hinge; total time=   0.2s
[CV] END ..........classifier__C=0.1, classifier__loss=hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=0.1, classifier__loss=hinge; total time=   0.1s
[CV] END ..........classifier__C=0.1, classifier__loss=hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=0.1, classifier__loss=hinge; total time=   0.2s
[CV] END ..classifier__C=0.1, classifier__loss=squared_hinge; total time=   0.1s
[CV] END ..classifier__C=0.1, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ..classifier__C=0.1, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..classifier__C=0.1, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ..classifier__C=0.1, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ............classifier__C=1, classifier__loss=hinge; total time=   0.7s


  y = column_or_1d(y, warn=True)


[CV] END ............classifier__C=1, classifier__loss=hinge; total time=   1.2s


  y = column_or_1d(y, warn=True)


[CV] END ............classifier__C=1, classifier__loss=hinge; total time=   0.9s


  y = column_or_1d(y, warn=True)


[CV] END ............classifier__C=1, classifier__loss=hinge; total time=   1.1s


  y = column_or_1d(y, warn=True)


[CV] END ............classifier__C=1, classifier__loss=hinge; total time=   1.2s
[CV] END ....classifier__C=1, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END ....classifier__C=1, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END ....classifier__C=1, classifier__loss=squared_hinge; total time=   0.3s


  y = column_or_1d(y, warn=True)


[CV] END ....classifier__C=1, classifier__loss=squared_hinge; total time=   0.3s


  y = column_or_1d(y, warn=True)


[CV] END ....classifier__C=1, classifier__loss=squared_hinge; total time=   3.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ...........classifier__C=10, classifier__loss=hinge; total time=   3.5s


  y = column_or_1d(y, warn=True)


[CV] END ...........classifier__C=10, classifier__loss=hinge; total time=   3.1s


  y = column_or_1d(y, warn=True)


[CV] END ...........classifier__C=10, classifier__loss=hinge; total time=   4.3s


  y = column_or_1d(y, warn=True)


[CV] END ...........classifier__C=10, classifier__loss=hinge; total time=   3.6s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ...........classifier__C=10, classifier__loss=hinge; total time=   3.0s
[CV] END ...classifier__C=10, classifier__loss=squared_hinge; total time=   0.1s
[CV] END ...classifier__C=10, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ...classifier__C=10, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ...classifier__C=10, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ...classifier__C=10, classifier__loss=squared_hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=100, classifier__loss=hinge; total time=   4.2s


  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=100, classifier__loss=hinge; total time=   4.2s


  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=100, classifier__loss=hinge; total time=   4.9s


  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=100, classifier__loss=hinge; total time=   8.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..........classifier__C=100, classifier__loss=hinge; total time=   4.4s
[CV] END ..classifier__C=100, classifier__loss=squared_hinge; total time=   0.1s
[CV] END ..classifier__C=100, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ..classifier__C=100, classifier__loss=squared_hinge; total time=   0.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ..classifier__C=100, classifier__loss=squared_hinge; total time=   0.2s
[CV] END ..classifier__C=100, classifier__loss=squared_hinge; total time=   0.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
2024/12/19 20:00:18 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


In [25]:
model1.best_params_

{'classifier__C': 1, 'classifier__loss': 'hinge'}

In [20]:
metrics1

{'train_acccuracy': 0.6680509413067552,
 'train_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs           1341            113          195          359
 pred_diamonds         100           1354          329          205
 pred_hearts           179            221         1124          235
 pred_spades           186            118          158         1007,
 'validation_accuracy': 0.5961538461538461,
 'validation_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs             52              4           10           24
 pred_diamonds           3             42           14            8
 pred_hearts             6             11           33            5
 pred_spades             4              8            8           28}

In [26]:
pipeline2 = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", OneVsRestClassifier(LinearSVC(random_state=RANDOM_STATE)))
])

model2, metrics2 = run_classification_pipeline(
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    pipeline2, 
    model_params=None, 
    experiment_name="classification"
)



In [27]:
metrics2

{'train_acccuracy': 0.6655592469545958,
 'train_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs           1303            106          175          335
 pred_diamonds          91           1343          318          193
 pred_hearts           193            234         1143          259
 pred_spades           219            123          170         1019,
 'validation_accuracy': 0.5923076923076923,
 'validation_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs             52              4            8           20
 pred_diamonds           2             42           14           10
 pred_hearts             6             11           33            8
 pred_spades             5              8           10           27}

Linear models seem to be performing poorly. Let us inspect the data to try and see if the decision boundaries are potentially non-linear. For this, we will reduce the dimensionality using PCA and t-SNE, and then plot the data. 

In [50]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import plotly.express as px

Using PCA

In [42]:
pca = PCA(n_components=3)
X_train_3d = pca.fit_transform(X_train)

2024/12/19 20:24:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '479fc2ed4fc347059bf1e76582707cfc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [55]:
df = pd.DataFrame(X_train_3d, columns=["pc1", "pc2", "pc3"])
df["class"] = LabelEncoder().fit_transform(y_train.reshape(-1, ))

In [58]:
fig = px.scatter_3d(df, x="pc1", y="pc2", z="pc3", color="class")
fig.show()

In 3 dimensions, the classification boundary seems to be non-linear, and we may benefit from using a non-linear kernel. 

In [59]:
from sklearn.svm import SVC

In [60]:
pipeline4 = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", SVC(kernel="poly", random_state=RANDOM_STATE))
])

model4, metrics4 = run_classification_pipeline(
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    pipeline4, 
    model_params=None, 
    experiment_name="classification"
)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [61]:
metrics4

{'train_acccuracy': 0.8463455149501661,
 'train_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs           1555             66          103          143
 pred_diamonds          41           1539           92           59
 pred_hearts            97            122         1533          117
 pred_spades           113             79           78         1487,
 'validation_accuracy': 0.6730769230769231,
 'validation_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs             50              4            7           17
 pred_diamonds           2             53           11            6
 pred_hearts             2              6           40           10
 pred_spades            11              2            7           32}

Since both the training and validation accuracy have gone up significantly, we can say that the decision boundary is non-linear. 

In [62]:
pipeline5 = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", SVC(random_state=RANDOM_STATE))
])

param_grid5 = {
    "classifier__C": [0.01, 0.1, 1, 10, 100], 
    "classifier__kernel": ["poly", "rbf"], 
    "classifier__degree": [2, 4, 6, 8, 10]
}

grid_search5 = GridSearchCV(
    pipeline5, 
    param_grid=param_grid5, 
    scoring="accuracy", 
    n_jobs=3, 
    cv=5
)

model5, metrics5 = run_classification_pipeline(
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    grid_search5, 
    model_params=None, 
    experiment_name="classification"
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [70]:
metrics5

{'train_acccuracy': 0.8158914728682171,
 'train_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs           1478             54           99          164
 pred_diamonds          60           1542          144           90
 pred_hearts           130            123         1460          138
 pred_spades           138             87          103         1414,
 'validation_accuracy': 0.7269230769230769,
 'validation_confusion_matrix':                true_clubs  true_diamonds  true_hearts  true_spades
 pred_clubs             56              2            4           13
 pred_diamonds           0             50           11            5
 pred_hearts             3              7           44            8
 pred_spades             6              6            6           39}

In [69]:
model5.best_estimator_.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('classifier', SVC(C=1, degree=2, random_state=0))],
 'verbose': False,
 'scaler': StandardScaler(),
 'classifier': SVC(C=1, degree=2, random_state=0),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'classifier__C': 1,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 2,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'rbf',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 0,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}