In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Read the stored training features and their labels from disk.
training_features = np.load('models/training_features-3.npy')
training_labels = np.load('models/training_labels-3.npy')

# Count the training samples per label; position i holds the count of label i.
training_class_distribution = np.bincount(training_labels)
print('Class distribution of the entire training set: ', training_class_distribution)

# Draw one bar per label so class imbalance is visible at a glance.
plt.bar(
    np.arange(training_class_distribution.size),
    training_class_distribution,
    label='Training set',
)
plt.legend()
plt.show()


Classes 2, 3, 4 and 5 are under-represented and especially need to be augmented. Classes 0 and 1 are already well represented in the training set.

In [45]:
from sklearn.utils import resample

# Resample every class (with replacement) to a target size that grows linearly
# with the class's original count: a class with a single sample is mapped to
# about `scaling_factor_min` of the largest class, and the largest class itself
# to about `scaling_factor_max` of its own size.
# NOTE(review): the later cells shown in this notebook fit on
# `training_features`, not on the upsampled arrays built here - confirm that
# this is intended.
majority_class_count = np.max(training_class_distribution)

# Guard the denominator: if the largest class has exactly one sample,
# `majority_class_count - 1` is 0 and the interpolation below would be 0/0.
training_count_span = max(majority_class_count - 1, 1)

upsampled_training_features = []
upsampled_training_labels = []

scaling_factor_min = 0.1
scaling_factor_max = 0.3

for label, count in enumerate(training_class_distribution):
    if count == 0:
        # np.bincount reports 0 for labels that never occur; there is nothing
        # to draw from, and resample() would fail on an empty index array.
        continue

    # Linear interpolation between the two scaling factors based on class size.
    scaling_factor = scaling_factor_min + (scaling_factor_max - scaling_factor_min) * (
        (count - 1) / training_count_span
    )
    class_indices = np.where(training_labels == label)[0]
    desired_class_size = int(np.round(training_count_span * scaling_factor)) + 1

    # Fixed random_state keeps the drawn indices identical between runs.
    oversampled_class_indices = resample(
        class_indices,
        replace=True,
        n_samples=desired_class_size,
        random_state=42,
    )

    upsampled_training_features.append(training_features[oversampled_class_indices])
    upsampled_training_labels.append(training_labels[oversampled_class_indices])

# Stack the per-class pieces into single arrays.
upsampled_training_features = np.concatenate(upsampled_training_features)
upsampled_training_labels = np.concatenate(upsampled_training_labels)


Choose a suitable subset of the training data for grid search

In [None]:
import matplotlib.pyplot as plt

# Fraction of the training set that is used for the hyperparameter search.
subset_size = 0.2

# Draw the subset indices without replacement from a seeded generator so the
# same subset is selected on every run of the notebook.
subset_indices = np.random.default_rng(42).choice(
    len(training_features),
    size=int(subset_size * len(training_features)),
    replace=False,
)
subset_features = training_features[subset_indices]
subset_labels = training_labels[subset_indices]

# Class distribution of the entire training set.
training_class_distribution = np.bincount(training_labels)

# Class distribution of the subset. `minlength` pads with zeros so that the
# array lines up label-by-label with the training distribution even when the
# highest labels did not make it into the subset.
# NOTE(review): the sampling is not stratified, so rare classes may be missing
# from the subset - check the printed counts before running the grid search.
subset_class_distribution = np.bincount(
    subset_labels, minlength=len(training_class_distribution)
)

# Compare the class distributions.
print('Class distribution of the entire training set: ', training_class_distribution)
print('Class distribution of the subset: ', subset_class_distribution)

# Plot both distributions on the same axes.
plt.bar(np.arange(len(training_class_distribution)), training_class_distribution, label='Training set')
plt.bar(np.arange(len(subset_class_distribution)), subset_class_distribution, label='Subset')
plt.legend()
plt.show()

Perform grid search

In [None]:
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Inverse-frequency class weights: the most frequent class gets weight 1.0 and
# rarer classes get proportionally larger weights. Labels whose count is zero
# are left out, because dividing by a zero count would produce an infinite
# weight.
largest_class_count = training_class_distribution.max()
class_weights = {
    int(label): float(largest_class_count / training_class_distribution[label])
    for label in np.flatnonzero(training_class_distribution)
}

# Hyperparameter grid for the SVM: 3 * 4 * 3 * 3 = 108 candidates.
parameters = {
    'C': [ 1, 50, 75],
    'gamma': [1, 0.01, 0.001 , 'scale'],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'class_weight': ['balanced', None, class_weights]
}

print(class_weights)

# Base estimator for the search. Its `class_weight` is replaced by the values
# from the grid for every candidate, because 'class_weight' is a grid key.
final_svm_classifier = SVC(class_weight=class_weights)

# 5-fold cross-validated search on the subset. `n_jobs=-1` evaluates the
# candidates in parallel on all available cores.
print('Begin grid search' + ' - ' + str(datetime.now()))
grid_search = GridSearchCV(final_svm_classifier, parameters, cv=5, n_jobs=-1)
print('Training SVM to find the best hyperparameters' + ' - ' + str(datetime.now()))
grid_search.fit(subset_features, subset_labels)
print('Finished looking for best hyperparameters' + ' - ' + str(datetime.now()))
print('Best hyperparameters: ', grid_search.best_params_)


Train the final SVM

In [None]:
# Take the winning estimator from the grid search and fit it on the complete
# training set. This is the same object that `grid_search` holds (no copy is
# made), so the fit() call below refits it in place.
final_svm_classifier = grid_search.best_estimator_

print(f'Begin training final SVM model - {datetime.now()}')
final_svm_classifier.fit(training_features, training_labels)
print(f'Finished training final SVM model - {datetime.now()}')

Predict with the final SVM

In [None]:
import matplotlib.pyplot as plt

# Read the stored test features and their labels from disk.
all_test_features = np.load('models/test_features-3.npy')
test_labels = np.load('models/test_labels-3.npy')

# Count the test samples per label; position i holds the count of label i.
test_class_distribution = np.bincount(test_labels)
print('Class distribution of the test set: ', test_class_distribution)

# Draw one bar per label.
plt.bar(
    np.arange(test_class_distribution.size),
    test_class_distribution,
    label='Test set',
)
plt.legend()
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Resample every test class (with replacement) to a target size that grows
# linearly with the class's original count, between about `scaling_factor_min`
# and `scaling_factor_max` of the largest test class.
# `scaling_factor_min`, `scaling_factor_max` and `resample` are not defined in
# this cell; they must already exist from an earlier cell.
majority_test_class_count = np.max(test_class_distribution)

# Guard the denominator: if the largest class has exactly one sample,
# `majority_test_class_count - 1` is 0 and the interpolation would be 0/0.
test_count_span = max(majority_test_class_count - 1, 1)

upsampled_test_features = []
upsampled_test_labels = []

for label, count in enumerate(test_class_distribution):
    if count == 0:
        # np.bincount reports 0 for labels that never occur; there is nothing
        # to draw from, and resample() would fail on an empty index array.
        continue

    # Linear interpolation between the two scaling factors based on class size.
    scaling_factor = scaling_factor_min + (scaling_factor_max - scaling_factor_min) * (
        (count - 1) / test_count_span
    )
    class_indices = np.where(test_labels == label)[0]
    desired_class_size = int(np.round(test_count_span * scaling_factor)) + 1

    # Fixed random_state keeps the drawn indices identical between runs.
    oversampled_class_indices = resample(
        class_indices,
        replace=True,
        n_samples=desired_class_size,
        random_state=42,
    )

    upsampled_test_features.append(all_test_features[oversampled_class_indices])
    upsampled_test_labels.append(test_labels[oversampled_class_indices])

# Stack the per-class pieces into single arrays.
upsampled_test_features = np.concatenate(upsampled_test_features)
upsampled_test_labels = np.concatenate(upsampled_test_labels)

# Class distribution of the upsampled test set.
upsampled_test_class_distribution = np.bincount(upsampled_test_labels)
print('Class distribution of the upsampled test set: ', upsampled_test_class_distribution)

# Plot the upsampled distribution.
plt.bar(np.arange(len(upsampled_test_class_distribution)), upsampled_test_class_distribution, label='Test set')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the original (not upsampled) test features and time the
# call with timestamps printed before and after.
print(f'Begin prediction - {datetime.now()}')
predictions = final_svm_classifier.predict(all_test_features)
print(f'Finish prediction - {datetime.now()}')

# Per-class precision / recall / F1 summary against the true test labels.
report = classification_report(test_labels, predictions)
print(report)