# SVM Classification with Features Extracted by CNNs
In this notebook, we train an SVM to check whether one satellite image contains the other. The inputs are image features extracted with the pre-trained [MobileNet](https://keras.io/applications/#mobilenet) implementation.

### Load the features

In [1]:
import pickle
import glob
import csv
import numpy as np
from config import CSV_PATH, FEATURE_PATH

# Load the pre-computed image features.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point FEATURE_PATH at a file you generated yourself.
with open(FEATURE_PATH, 'rb') as f:
    feature_dict = pickle.load(f)
# `img_idx` maps a key found in the CSV rows to a position in `preds`
# (presumably image identifier -> row of the feature matrix; verify against
# the script that wrote the pickle).
img_idx, preds = feature_dict['img_idx'], feature_dict['preds']

y = []
X = []

# glob.glob() returns matches in an arbitrary, filesystem-dependent order.
# Sorting pins the sample order so that the seeded train/test split performed
# later in the notebook yields the same partition on every machine.
csv_paths = sorted(glob.glob(CSV_PATH + '*.csv'))
for csv_path in csv_paths:
    # newline='' is the file mode the csv module documents for reader input.
    with open(csv_path, 'r', newline='') as f:
        for row in csv.reader(f):
            # Only 4-column rows are treated as samples; anything else is skipped.
            if len(row) != 4:
                continue
            # Column 3 holds the integer label; columns 1 and 2 hold the keys
            # of the two images whose feature vectors are concatenated.
            y.append(int(row[3]))
            X.append(np.concatenate((preds[img_idx[row[1]]],
                                     preds[img_idx[row[2]]])))
y = np.array(y)
X = np.array(X)

print('Number of samples:', X.shape[0])
print()

# Fraction of samples labelled 1; the remainder is labelled 0.
positive_share = np.sum(y) / y.shape[0]
print('Distribution of 0s and 1s:')
print('0\t', 1 - positive_share)
print('1\t', positive_share)

Number of samples: 3000

Distribution of 0s and 1s:
0	 0.5
1	 0.5


### Split the dataset into training and test sets

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123457,
)

### Train an SVM using Cross-Validated Grid-Search

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Hyper-parameter grid for the cross-validated search
# (currently a single RBF-kernel candidate).
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2e-3]}]

# One grid search is run per metric, scored with its macro-averaged variant.
scores = ['precision', 'recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validation on the training split, using every available core.
    clf = GridSearchCV(
        SVC(),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
        n_jobs=-1,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Mean CV score and a two-standard-deviation spread for every candidate.
    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    for idx, params in enumerate(cv_results['params']):
        mean = cv_results['mean_test_score'][idx]
        std = cv_results['std_test_score'][idx]
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    # Evaluate the refitted best estimator on the held-out test split.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Keep a second handle on the last fitted search (the f1-tuned one),
# because `clf` is rebound by later cells.
clf_ = clf

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'gamma': 0.002, 'kernel': 'rbf'}

Grid scores on development set:

0.882 (+/-0.029) for {'gamma': 0.002, 'kernel': 'rbf'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       297
           1       0.84      0.97      0.90       303

   micro avg       0.89      0.89      0.89       600
   macro avg       0.90      0.89      0.89       600
weighted avg       0.90      0.89      0.89       600


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'gamma': 0.002, 'kernel': 'rbf'}

Grid scores on development set:

0.870 (+/-0.036) for {'gamma': 0.002, 'kernel': 'rbf'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the f

# Modelling Human Capability of Matching
In this section, we train an SVM to model how humans judge whether one image contains another.

In [21]:
import csv
import numpy as np

# Dataset variant to model: 'A', 'B' or 'C'.
V = 'C'

# Each variant supplies its own train/test CSV paths (taken from config) and
# the CSV column indices consumed by load_from_csv():
#   b -> column holding the integer label
#   s, t -> columns holding the keys of the two images of a pair
if V == 'A':
    from config import TRAIN_PATH as TRAIN_PATH
    from config import TEST_PATH as TEST_PATH
    b,s,t = 9,5,6
elif V == 'B':
    from config import TRAINB_PATH as TRAIN_PATH
    from config import TESTB_PATH as TEST_PATH
    b,s,t = 62,5,6
elif V == 'C':
    from config import TRAINC_PATH as TRAIN_PATH
    from config import TESTC_PATH as TEST_PATH
    b,s,t = 62,5,6
else:
    # Fail here with a clear message instead of hitting a NameError on
    # TRAIN_PATH / b / s / t further down.
    raise ValueError("Unknown dataset variant V=%r; expected 'A', 'B' or 'C'" % (V,))

def load_from_csv(path):
    """Build a feature matrix and label vector from one CSV file.

    The first row of the file is treated as a header and skipped. For every
    remaining non-empty row, the label is read from column ``b`` and the
    feature vectors of the images referenced in columns ``s`` and ``t`` are
    concatenated into a single sample.

    Relies on the module-level names ``b``, ``s``, ``t`` (column indices),
    ``img_idx`` (row key -> index into ``preds``) and ``preds`` (per-image
    feature vectors).

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    X : numpy.ndarray
        One row per sample: the two feature vectors joined end to end.
    y : numpy.ndarray
        Integer labels, in the same order as ``X``.

    Raises
    ------
    KeyError
        If a row references a key that is missing from ``img_idx``.
    ValueError
        If a label field cannot be parsed as an integer.
    """
    y = []
    X = []
    # newline='' is the file mode the csv module documents for reader input.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing empty
            # line); skip them instead of failing with an IndexError.
            if not row:
                continue
            y.append(int(row[b]))
            x = np.concatenate((preds[img_idx[row[s]]], preds[img_idx[row[t]]]))
            X.append(x)

    y = np.array(y)
    X = np.array(X)

    return X, y

# Build the train and test matrices from their respective CSV files.
X_train_, y_train_ = load_from_csv(TRAIN_PATH)
X_test_, y_test_ = load_from_csv(TEST_PATH)

n_train = X_train_.shape[0]
print('Number of training samples:', n_train)
print()

# Share of training samples labelled 1; the rest are labelled 0.
train_positive_share = np.sum(y_train_) / y_train_.shape[0]
print('Distribution of 0s and 1s :')
print('0\t', 1 - train_positive_share)
print('1\t', train_positive_share)

### Split the dataset into training and test sets
# Use one fifth of the dataset as the test set 
# X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.2)

# from utils import get_pcomp
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_ = scaler.fit_transform(X_train_)
# X_test_ = scaler.transform(X_test_)

# pcomp = get_pcomp(X_train_)  # Threshold is t=0.99999 (1-eps)
# print('Number of PCA components: ',pcomp)
# pca = PCA(n_components=pcomp)
# X_train_ = pca.fit_transform(X_train_)
# X_test_ = pca.transform(X_test_)

Number of training samples: 1200

Distribution of 0s and 1s :
0	 0.37
1	 0.63


### Train an SVM using Cross-Validated Grid-Search

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Search grid: RBF kernel with four gamma values crossed with five C values,
# i.e. 20 candidates per metric.
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
        'C': [1, 10, 100, 1000, 10000],
    }
]

# A separate grid search is run for each metric (macro-averaged variant).
scores = ['precision', 'recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validation on the training set, using every available core.
    clf = GridSearchCV(
        SVC(),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
        n_jobs=-1,
    )
    clf.fit(X_train_, y_train_)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Mean CV score and a two-standard-deviation spread for every candidate.
    print("Grid scores on development set:")
    print()
    results = clf.cv_results_
    candidates = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )
    for mean, std, params in candidates:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    # Evaluate the refitted best estimator on the separate test set.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true = y_test_
    y_pred = clf.predict(X_test_)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyper-parameters for precision





Best parameters set found on development set:

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.617 (+/-0.391) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.655 (+/-0.143) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.597 (+/-0.307) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.315 (+/-0.001) for {'C': 1, 'gamma': 1e-05, 'kernel': 'rbf'}
0.616 (+/-0.332) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.573 (+/-0.058) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.616 (+/-0.126) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.593 (+/-0.307) for {'C': 10, 'gamma': 1e-05, 'kernel': 'rbf'}
0.616 (+/-0.332) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.578 (+/-0.054) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.553 (+/-0.055) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.605 (+/-0.112) for {'C': 100, 'gamma': 1e-05, 'kernel': 'rbf'}
0.616 (+/-0.332) for {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.578 (+/-0.054) for {'C': 1000, 'gamma': 0.

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Search grid for the MLP: a single architecture with two hidden layers of
# 64 units each.
tuned_parameters = [{'hidden_layer_sizes': [(64, 64)]}]

# Alternative single-hidden-layer grid (disabled):
#   [(i*10 + 50,) for i in range(1, 5)]

scores = ['f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # MLPClassifier draws its initial weights (and mini-batches) at random;
    # without a fixed random_state the scores below change on every run.
    # The seed matches the one used for the train/test split in this notebook.
    clf = GridSearchCV(
        MLPClassifier(random_state=123457),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
        n_jobs=-1,
    )
    clf.fit(X_train_, y_train_)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Mean CV score and a two-standard-deviation spread for every candidate.
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    # Evaluate the refitted best estimator on the separate test set.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test_, clf.predict(X_test_)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'hidden_layer_sizes': (64, 64)}

Grid scores on development set:

0.556 (+/-0.051) for {'hidden_layer_sizes': (64, 64)}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.43      0.41      0.42       109
           1       0.67      0.69      0.68       191

   micro avg       0.59      0.59      0.59       300
   macro avg       0.55      0.55      0.55       300
weighted avg       0.58      0.59      0.58       300


