In [None]:
import os

# Import installed libraries
import numpy as np
import pandas as pd
import sklearn

from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# Determine the base path of the project data. On Google Colab the drive is
# mounted first; everywhere else an empty base path is used, so that paths
# joined onto it stay relative.
try:
    from google.colab import drive
except ModuleNotFoundError:
    # google.colab cannot be imported, i.e. we are not running on Colab
    core_path = ""
else:
    # Deliberately outside the try body: a ModuleNotFoundError raised while
    # mounting must not be mistaken for "not on Colab" and silently swallowed.
    drive.mount('/gdrive')
    # NOTE(review): "/path name/" looks like a placeholder for the project
    # folder inside the mounted drive - TODO confirm and adapt.
    core_path = "/path name/"

Mounted at /gdrive


In [None]:
# Load the asteroid data from the pickle file below data/lvl2/.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
asteroids_path = os.path.join(core_path, "data/lvl2/", "asteroids.pkl")
asteroids_df = pd.read_pickle(asteroids_path)


In [None]:
# Features: for every entry of the "SpectrumDF" column, take its
# "Reflectance_norm550nm" values as one row of the input array
spectra = asteroids_df["SpectrumDF"]
asteroids_X = np.array(
    [spectrum_df["Reflectance_norm550nm"].tolist() for spectrum_df in spectra]
)

# Labels: the "Main_Group" entry of every asteroid
main_groups = asteroids_df["Main_Group"].to_list()
asteroids_y = np.array(main_groups)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A fixed random_state makes the split - and every result derived from it -
# reproducible after a kernel restart.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Create a simple, single train / test split (n_splits=1 yields exactly one
# pair of index arrays, so it can be taken directly from the generator)
train_index, test_index = next(sss.split(asteroids_X, asteroids_y))

X_train, X_test = asteroids_X[train_index], asteroids_X[test_index]
y_train, y_test = asteroids_y[train_index], asteroids_y[test_index]

# Compute class weightings: the inverse of each class' relative frequency in
# the training labels. NOTE: int() truncates the ratio towards zero.
class_labels, class_counts = np.unique(y_train, return_counts=True)
weight_dict = {
    ast_type: int(1.0 / (count / len(y_train)))
    for ast_type, count in zip(class_labels, class_counts)
}

In [None]:
from sklearn.metrics import f1_score, make_scorer

# Both kernels are searched over the same 25 logarithmically spaced values of C
# between 10**0 and 10**3.5
c_values = np.logspace(0, 3.5, 25)
param_grid = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf']},
]

# Support vector classifier that uses the per-class weights
svc = svm.SVC(class_weight=weight_dict)

# Fit the standard scaler on the training data and scale the training data with it.
# NOTE(review): the scaler sees all training rows before the cross-validation
# folds are formed; consider whether a Pipeline (scaler + SVC) is wanted here.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# 5-fold cross-validated grid search, scored with the weighted F1 score
weighted_f1_scorer = make_scorer(f1_score, average="weighted")
wclf = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring=weighted_f1_scorer,
    cv=5,
    verbose=3,
)
wclf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ..............C=1.0, kernel=linear;, score=0.830 total time=   0.3s
[CV 2/5] END ..............C=1.0, kernel=linear;, score=0.861 total time=   0.3s
[CV 3/5] END ..............C=1.0, kernel=linear;, score=0.828 total time=   0.2s
[CV 4/5] END ..............C=1.0, kernel=linear;, score=0.823 total time=   0.1s
[CV 5/5] END ..............C=1.0, kernel=linear;, score=0.834 total time=   0.2s
[CV 1/5] END C=1.3990503141372939, kernel=linear;, score=0.828 total time=   0.3s
[CV 2/5] END C=1.3990503141372939, kernel=linear;, score=0.853 total time=   0.4s
[CV 3/5] END C=1.3990503141372939, kernel=linear;, score=0.816 total time=   0.3s
[CV 4/5] END C=1.3990503141372939, kernel=linear;, score=0.827 total time=   0.2s
[CV 5/5] END C=1.3990503141372939, kernel=linear;, score=0.834 total time=   0.3s
[CV 1/5] END C=1.9573417814876604, kernel=linear;, score=0.837 total time=   0.4s
[CV 2/5] END C=1.9573417814876604, kernel

In [None]:
# Take the best estimator found by the grid search (`wclf`) as the final classifier
final_clf = wclf.best_estimator_

# Report the kernel of the final classifier and its full configuration
print(f"Kernel with the best result: {final_clf.kernel}")
print(f"SVM information: {final_clf}")

NameError: ignored

In [None]:
# Scale the test data with the already fitted scaler (it is only applied here,
# not re-fitted) ...
X_test_scaled = scaler.transform(X_test)

# ... and perform a prediction
y_test_pred = final_clf.predict(X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix

# Pass the labels explicitly so the row / column order of the matrix is fixed
group_labels = ["C", "S", "X", "Other"]
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=group_labels)

print(conf_mat)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of the final classifier on the scaled test data.
# `sklearn.metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(final_clf, X_test_scaled, y_test, values_format='d')
else:
    # Fallback for scikit-learn versions older than 1.0
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(final_clf, X_test_scaled, y_test, values_format='d')
plt.show()

In [None]:
# Weighted F1 score of the test predictions, rounded to 3 decimals. The result is
# stored under its own name: rebinding `f1_score` would shadow the scikit-learn
# function of the same name that is imported earlier in this notebook.
test_f1 = round(sklearn.metrics.f1_score(y_test, y_test_pred, average="weighted"), 3)
print(f"F1 Score: {test_f1}")