Are the cats and dogs well separated, i.e. can you obtain good classification test accuracy performance on this data set? Compare at least 3 classifiers.

Are there any images that are consistently mislabeled by the classifiers (use resampling to ascertain)? Why do you think these are difficult images to classify? Do the classifiers struggle with the same observations?

Are the errors balanced or is one class more difficult to classify correctly?

In [12]:
import os
import numpy as np
import matplotlib as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

def read_data_file(filename: str) -> pd.DataFrame:
    path = Path(os.getcwd() + 'q1.ipynb')
    data_folder = str(path.parent.absolute()) + '/data/'
    return pd.read_csv(data_folder + filename)


features = read_data_file('CATSnDOGS.csv') / 255 # Rescale features to values between 0 and 1
labels = read_data_file('Labels.csv')

models = [
          ('SVM', SVC()),
          ('LogReg', LogisticRegression(max_iter=1000)), 
          ('RF', RandomForestClassifier())
        ]

In [22]:
# rename
def evaluate_ensemble(models: list, features: pd.DataFrame, labels: pd.DataFrame) -> pd.DataFrame:
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, stratify=labels, random_state=None)
    dfs = []
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    for name, model in models:
        kfold = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
        cross_validate(model, X_train, y_train.to_numpy().ravel(), cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train.to_numpy().ravel())
        y_pred = clf.predict(X_test)
        
        df = pd.DataFrame({'_id': y_test.index.values, 'true': y_test.to_numpy().ravel(), 'predicted': y_pred, 'model': name})
        dfs.append(df)

    result = pd.concat(dfs, ignore_index=True)
    return result


# want: mean and std for each scoring, for each model over iterations
def average_scores(models: list, features: pd.DataFrame, labels: pd.DataFrame, iter: int=5):
    matrix = []
    for it in range(iter):
        res = evaluate_ensemble(models, features, labels)
        scores = calc_scores(res)
        matrix.append(scores)
    
    mean = np.mean(matrix, axis=0)
    std = np.std(matrix, axis=0)
    return mean, std

    
# want: score for each scoring for each model
def calc_scores(ensemble_result: pd.DataFrame) -> np.ndarray:
    model_names = ensemble_result.model.unique()
    scores = np.zeros((len(model_names), 3))

    for i, name in enumerate(model_names):
        this_df = ensemble_result[ensemble_result.model == name]
        y_true = this_df['true']
        y_pred = this_df['predicted']

        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()
        cat_accuracy = tn / (tn + fp) # specificity
        dog_accuracy = tp / (tp + fn) # recall
        accuracy = (tn + tp)/(tn + fp + fn + tp)
        scores[i][0] = cat_accuracy
        scores[i][1] = dog_accuracy
        scores[i][2] = accuracy

    return scores


def detect_mislabels(models: list, features: pd.DataFrame, labels: pd.DataFrame, iter: int = 5, vote_limit: float = 0.5) -> dict:  
    dfs = []
    mislabels = {}

    for it in range(iter):
        res = evaluate_ensemble(models, features, labels)
        res['vote'] = np.absolute(res.true-res.predicted)
        dfs.append(res)

    df = pd.concat(dfs, ignore_index=True)
    model_names = df.model.unique()

    for i, name in enumerate(model_names):
        mislabelled_ids = []
        model_df = df[df.model == name]
        _ids = model_df._id.unique()

        for _id in _ids:
            _id_df = model_df[model_df._id == _id]
            total = len(_id_df.index)
            vote = _id_df['vote'].sum()
            if vote/total < vote_limit:
                continue
            else:
                mislabelled_ids.append(_id)

        mislabels[name] = mislabelled_ids

    return mislabels


mean, std = average_scores(models, features, labels)
mislabels = detect_mislabels(models, features, labels)

In [29]:
mislabels_svm = set(mislabels['SVM'])
mislabels_logreg = set(mislabels['LogReg'])
mislabels_rf = set(mislabels['RF'])

intersection = mislabels_svm.intersection(mislabels_logreg).intersection(mislabels_rf)
print(intersection)
print("%s \% of mislabels by the SVM is shared by all models." % len(intersection)/len(mislabels_svm)*100)

{8, 139, 14, 21, 151, 26, 158, 37, 168, 176, 180, 53, 54, 58, 63, 192, 78, 79, 85, 90, 93, 96, 113, 114, 124}


TypeError: not enough arguments for format string