`imageclef2011` image features, the set of all
labels in `concepts_2011.txt` and the image labels `trainset_gt_annotations.txt`

In [1]:
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
concepts = np.loadtxt('./data/concepts_2011.txt', dtype=str)
print(concepts[10:14])

[['9' 'Spring']
 ['10' 'Summer']
 ['11' 'Autumn']
 ['12' 'Winter']]


In [3]:
labelled = pd.read_csv('./data/trainset_gt_annotations.txt', sep=' ', header=None)
# COL MAP:
    # 0: file_name
    # 1 - 99: concepts
df_labels = labelled[[0, 10, 11, 12, 13]]
df_labels = df_labels[(df_labels[10] == 1) | (df_labels[11] == 1) | (df_labels[12] == 1) | (df_labels[13] == 1)]
print(df_labels.shape)
print(df_labels.head())

(1353, 5)
                                          0   10  11  12  13
16  01d88c87-f2e3-4a7f-be26-b38c549b5e7d.jpg   1   0   0   0
33  03b0bf90-f66b-41ad-b7ae-0e9f818cb9fd.jpg   0   1   0   0
34  03c6c5ed-188b-4034-a487-de10aa3aac0d.jpg   0   0   0   1
37  03e489e7-7c0f-48f3-971d-b44b7e026f25.jpg   0   1   0   0
43  040b9239-da83-462b-991e-c4f2a1898a62.jpg   0   1   0   0


### Get features

In [4]:
os.chdir("./data/imageclef2011_feats/")
npy_files = os.listdir()
file_end = "_ft.npy"

In [5]:
N = len(df_labels.index) # 1353
features = np.zeros((N, 1024))
for i in range(N):
    file_name = df_labels.iloc[i][0] + file_end
    arr = np.load(file_name)
    features[i] = arr
    
print(N)
print(features.shape)

1353
(1353, 1024)


In [6]:
df_features = pd.DataFrame(features, index=df_labels.index)
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
16,0.000487,0.005092,0.002814,0.000255,0.092691,0.143073,0.000282,0.005021,0.244339,0.000229,...,0.155491,0.033567,0.884194,1.652364,3.855961,0.05555,0.127505,2.406339,0.075314,0.061892
33,0.000299,0.006495,0.005325,0.004764,0.054378,0.913825,0.00041,0.002364,0.103398,0.000597,...,1.430287,0.596499,2.473551,2.021996,2.559144,3.585923,0.391455,0.465208,1.017973,2.098028
34,0.000334,0.009949,0.001487,0.002653,0.119458,1.603592,0.000545,0.001888,0.504032,0.000396,...,1.016409,0.552168,0.929716,0.039299,0.356691,1.456054,1.37703,1.67701,0.064636,1.076554
37,0.000495,0.004574,0.003922,0.005027,0.033131,0.376466,0.000441,0.001464,0.287925,0.000253,...,0.371879,2.791942,3.884916,0.773607,0.724978,0.555365,1.49982,1.941962,2.903649,2.013276
43,0.00015,0.006658,0.000929,0.001262,0.192029,0.387356,0.000796,0.002781,0.274344,0.000517,...,6.574651,0.735099,1.287308,0.334603,0.241196,0.0,1.480002,0.072667,0.755902,0.374753


Split the images into
- 60 percent per class for training
- 10 percent per class for validation, 
- 30 percent per class for final test. 

What is the difference to a random 60 − 10 − 30 split of the whole data as compared to split
class-wise? 

Why I asked you to split classwise ? Explain in at most 8 sentences.

idea: use a linear SVM (e.g. scikit-learn) with python3 interface. If you
use another kernel, please notify clearly in the report.

for the first experiment take only the features from those images who have
as label either spring or summer or winter or fall. that should be 1353
samples

In [7]:
def split_data(data, train=0.6, test=0.3, val=0.1):
    N = len(data)
    index = list(data.index)
    
    np.random.shuffle(index)
    
    first_partition = int(N*train)
    second_partition = int(N*train) + int(N*test)
    
    train_index = index[:first_partition]
    test_index = index[first_partition: second_partition]
    val_index = index[second_partition: ]
    
    train = data.loc[train_index]
    test = data.loc[test_index]
    val = data.loc[val_index]
    
    return train, test, val

In [8]:
seasons_map = {10: 'Spring', 11: 'Summer', 12: 'Autumn', 13: 'Winter'}

In [9]:
# covert multilabel to multiclass
# for season in seasons_map:
def label_helper(row):
    for season in seasons_map:
        if row[season] == 1:
            return season
    else:
        return None

In [10]:
labels = df_labels.apply(label_helper, axis=1)
df_features["labels"] = labels

In [12]:
df_features = df_features.reset_index(drop=True)
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,labels
0,0.000487,0.005092,0.002814,0.000255,0.092691,0.143073,0.000282,0.005021,0.244339,0.000229,...,0.033567,0.884194,1.652364,3.855961,0.05555,0.127505,2.406339,0.075314,0.061892,10
1,0.000299,0.006495,0.005325,0.004764,0.054378,0.913825,0.00041,0.002364,0.103398,0.000597,...,0.596499,2.473551,2.021996,2.559144,3.585923,0.391455,0.465208,1.017973,2.098028,11
2,0.000334,0.009949,0.001487,0.002653,0.119458,1.603592,0.000545,0.001888,0.504032,0.000396,...,0.552168,0.929716,0.039299,0.356691,1.456054,1.37703,1.67701,0.064636,1.076554,13
3,0.000495,0.004574,0.003922,0.005027,0.033131,0.376466,0.000441,0.001464,0.287925,0.000253,...,2.791942,3.884916,0.773607,0.724978,0.555365,1.49982,1.941962,2.903649,2.013276,11
4,0.00015,0.006658,0.000929,0.001262,0.192029,0.387356,0.000796,0.002781,0.274344,0.000517,...,0.735099,1.287308,0.334603,0.241196,0.0,1.480002,0.072667,0.755902,0.374753,11


### Save as `.npy`

In [13]:
df_train = pd.DataFrame()
df_val = pd.DataFrame()
df_test = pd.DataFrame()

for season in seasons_map:
    df_season = df_features[df_features["labels"] == season]
    train, test, val = split_data(df_season)
    df_train = pd.concat([df_train, train])
    df_val = pd.concat([df_val, val])
    df_test = pd.concat([df_test, test])

np.save("../df_train.npy", df_train.values)
np.save("../df_val.npy", df_val.values)
np.save("../df_test.npy", df_test.values)

## Classifier

In [14]:
def generate_xy(df, label_name, label):
    X = df.drop('labels', axis=1).values
    y = df[label_name]
    y = [1 if el == label else 0 for el in y]
    return X, y

def classwise_accuracy(y_true, y_pred):
    classes = np.unique(y_true)
    N = len(y_true)
    A = 0
    for c in classes:
        n_count = 0
        correct = 0
        for i in range(N):
            if y_true[i] == c:
                n_count += 1
                if y_pred[i] == c:
                    correct += 1
                    
        A += correct/n_count
    return A/len(classes)

def vanilla_accuracy(y_true, y_pred):
    return np.mean(y_pred == y_true)

In [15]:
constants = [0.01, 0.1, 0.1**0.5, 1., 10**0.5, 10]

for season in seasons_map:
    X_train, y_train = generate_xy(df_train, 'labels', season)
    X_val, y_val = generate_xy(df_val, 'labels', season)
    X_test, y_test = generate_xy(df_test, 'labels', season)

    best_vanilla_c = None
    best_classwise_c = None
    
    best_vanilla_acc = 0
    best_classwise_acc = 0 
    print(f"\nSeason: {seasons_map[season]}")
    for i in range(len(constants)):
        C = constants[i]
        clf = LinearSVC(C=C)
        clf.fit(X_train, y_train)
        y_val_pred = clf.predict(X_val)

        current_vanilla_acc = vanilla_accuracy(y_val_pred, y_val)
        current_classwise_acc = classwise_accuracy(y_val_pred, y_val)
        
#         print(f"Constant: {C}, V Acc: {current_vanilla_acc}, Classwise Acc: {current_classwise_acc}")
        
        if current_vanilla_acc > best_vanilla_acc:
            best_vanilla_c = C
            best_vanilla_acc = current_vanilla_acc
            
        if current_classwise_acc > best_classwise_acc:
            best_classwise_c = C
            best_classwise_acc = current_classwise_acc

    print(f"Vanilla")
    print(f"Best Constant: {best_vanilla_c}, Acc: {best_vanilla_acc}")
    print(f"Classwise")
    print(f"Best Constant: {best_classwise_c}, Acc: {best_classwise_acc}")


Season: Spring
Vanilla
Best Constant: 0.01, Acc: 0.9136690647482014
Classwise
Best Constant: 0.01, Acc: 0.6664179104477612

Season: Summer
Vanilla
Best Constant: 0.01, Acc: 0.7194244604316546
Classwise
Best Constant: 0.01, Acc: 0.6940898345153664

Season: Autumn
Vanilla
Best Constant: 0.01, Acc: 0.9064748201438849
Classwise
Best Constant: 0.01, Acc: 0.8112403100775194

Season: Winter
Vanilla
Best Constant: 0.01, Acc: 0.8561151079136691
Classwise
Best Constant: 0.01, Acc: 0.7284044715447154


In [None]:
constants = [0.01, 0.1, 0.1**0.5, 1., 10**0.5, 10]

for season in seasons_map:
    X_train, y_train = generate_xy(df_train, 'labels', season)
    X_val, y_val = generate_xy(df_val, 'labels', season)
    X_test, y_test = generate_xy(df_test, 'labels', season)

    best_vanilla_c = None
    best_classwise_c = None
    
    best_vanilla_acc = 0
    best_classwise_acc = 0  
    for i in range(len(constants)):
        C = constants[i]
        clf = LinearSVC(C=C)
        clf.fit(X_train, y_train)
        y_val_pred = clf.predict(X_val)

        current_vanilla_acc = vanilla_accuracy(y_val_pred, y_val)
        current_classwise_acc = classwise_accuracy(y_val_pred, y_val)
        
        if current_vanilla_acc > best_vanilla_acc:
            best_vanilla_c = C
            best_vanilla_acc = current_vanilla_acc
            
        if current_classwise_acc > best_classwise_acc:
            best_classwise_c = C
            best_vanilla_acc = current_vanilla_acc

    print(f"Season: {seasons_map[season]}")
    clf = LinearSVC(C=best_c)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = np.mean(y_pred == y_test)
    
    print(f"Best C: {round(best_c, 3)}, Val. Acc: {best_acc}, Test Acc: {acc}")

### Classwise Accuracy 

$$
\begin{aligned} A &=\frac{1}{C} \sum_{c=1}^{C} a_{c} \\ a_{c} &=\frac{1}{\sum_{i=1}^{n} 1\left[y_{i}==c\right]} \sum_{i=1}^{n} 1\left[y_{i}==c\right] 1\left[f\left(x_{i}\right)==c\right] \end{aligned}
$$

That results in a multi-class dataset with mutually exclusive labels. Thus
you can train 4 binary svms, one for each class in one-vs-all manner. Each
svm is trained on the training dataset using all the training data.

This method has one free parameter - the regularization constant. 

Find the best regularization constant from the set $0.01,0.1,0.1^{0.5}, 1,10^{0.5}, 10,100^{0.5}$
by repeatedly training on the training set and measuring performance on
the validation set. Use as performance measure the class-wise accuracy
averaged over all 4 classes.

Split the images into 
- 60 percent per class for training, 
- 10 percent per class for validation, 
- 30 percent per class for final test. 

What is the difference
to a random 60 − 10 − 30 split of the whole data as compared to split
class-wise? Why I asked you to split classwise ? Explain in at most 8
sentences.