Are the cats and dogs well separated, i.e., can you obtain good test-set classification accuracy on this data set? Compare at least three classifiers.

Are there any images that are consistently mislabeled by the classifiers (use resampling to verify this)? Why do you think these images are difficult to classify? Do the classifiers struggle with the same observations?

Are the errors balanced or is one class more difficult to classify correctly?

In [52]:
import os
import numpy as np
import matplotlib as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

def read_data_file(filename: str) -> pd.DataFrame:
    """Load a CSV file from the ``data`` folder beside the working directory.

    The file is looked up in ``<parent of cwd>/data/``, i.e. a ``data``
    directory that is a sibling of the current working directory.

    Args:
        filename: Name of the CSV file inside the data folder.

    Returns:
        The parsed file contents as returned by ``pandas.read_csv``.
    """
    # The data folder sits one level above the working directory, not inside it.
    data_folder = Path.cwd().parent / 'data'
    return pd.read_csv(data_folder / filename)


# Scale the feature values by 1/255 so they fall in [0, 1]
# (assumes the raw values are 8-bit intensities in 0-255 — TODO confirm).
features = read_data_file('CATSnDOGS.csv').div(255)
labels = read_data_file('Labels.csv')

In [54]:
def compare_ensemble(models: list, features: pd.DataFrame, labels: pd.DataFrame) -> pd.DataFrame:
    """Cross-validate several classifiers and print their hold-out reports.

    The data is split 75/25 into a training and a test part. Each model is
    scored with 9-fold stratified cross-validation on the training part, then
    refitted on the whole training part and evaluated on the test part, for
    which a classification report is printed.

    Args:
        models: Sequence of ``(name, estimator)`` pairs. Each estimator is
            fitted in place by this function.
        features: Feature matrix, one row per observation.
        labels: Class labels aligned with ``features``.

    Returns:
        One row per (model, CV fold) holding the ``cross_validate`` timings
        and the accuracy / precision / recall / f1 test scores, plus a
        ``model`` column with the model name.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        # Fail early with a clear message instead of pandas' opaque
        # "No objects to concatenate" after the split has already been done.
        raise ValueError('models must contain at least one (name, estimator) pair')

    # NOTE(review): this hold-out split is not stratified, unlike the CV folds below.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
    # The estimators are given a flat 1-D label vector instead of the one-column frame.
    y_train_flat = y_train.to_numpy().ravel()

    scoring = ['accuracy', 'precision', 'recall', 'f1']
    # Presumably label 0 is cat and label 1 is dog — verify against Labels.csv.
    target_names = ['cat', 'dog']
    # The same seeded splitter is shared so every model is scored on identical folds.
    kfold = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)

    dfs = []
    for name, model in models:
        cv_results = cross_validate(model, X_train, y_train_flat, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train_flat)
        y_pred = clf.predict(X_test)

        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))

        df = pd.DataFrame(cv_results)
        df['model'] = name
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)


# TODO: tune hyperparameters (e.g. the LogisticRegression regularisation
# strength, SVC kernel/C, RandomForest size) instead of relying on defaults.
models = [
    ('SVM', SVC()),
    ('LogReg', LogisticRegression(max_iter=1000)),
    # Seeded like the train/test split and CV folds so reruns give the same scores.
    ('RF', RandomForestClassifier(random_state=42)),
]

results = compare_ensemble(models, features, labels)
#print(results)

    fit_time  score_time  test_accuracy  test_precision  test_recall  \
0   0.151121    0.046213       0.823529        0.777778        0.875   
1   0.118792    0.049701       0.823529        0.857143        0.750   
2   0.129698    0.043484       0.705882        1.000000        0.375   
3   0.114759    0.052727       0.941176        1.000000        0.875   
4   0.117213    0.042294       0.812500        0.857143        0.750   
5   0.092210    0.044382       0.625000        1.000000        0.250   
6   0.112166    0.049823       0.625000        0.666667        0.500   
7   0.101025    0.045671       0.937500        1.000000        0.875   
8   0.137376    0.044211       0.937500        0.888889        1.000   
9   0.465667    0.026216       0.764706        0.700000        0.875   
10  0.472822    0.027593       0.764706        0.750000        0.750   
11  0.424196    0.034567       0.529412        0.500000        0.375   
12  0.484139    0.037761       0.941176        0.888889        1