In [None]:
import os

import numpy as np
import pandas as pd
import sklearn

In [None]:
# Decide the path prefix (core_path) that is later joined onto data file paths.
# On Google Colab the Drive is mounted first; elsewhere an empty prefix is used.
try:
    from google.colab import drive
except ModuleNotFoundError:
    # google.colab is not installed, so this is not a Colab runtime
    core_path = ""
else:
    # Only the import is guarded: an error raised while mounting must surface
    # instead of being mistaken for "not running on Colab".
    drive.mount('/gdrive')
    core_path = "/path name/"

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Load the pickled asteroid table from data/lvl2/ below core_path.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
asteroids_pkl_path = os.path.join(core_path, "data/lvl2/", "asteroids.pkl")
asteroids_df = pd.read_pickle(asteroids_pkl_path)


In [None]:
# Binary target column: 1 where Main_Group equals "X", 0 for every other value
is_x_group = asteroids_df["Main_Group"].eq("X")
asteroids_df.loc[:, "Class"] = is_x_group.astype("int64")

In [None]:
# Feature matrix: one row per entry of the "SpectrumDF" column, holding that
# entry's "Reflectance_norm550nm" values
spectrum_frames = asteroids_df["SpectrumDF"]
asteroids_X = np.array(
    [frame["Reflectance_norm550nm"].tolist() for frame in spectrum_frames]
)

# Label vector taken from the "Class" column
class_labels = asteroids_df["Class"].to_list()
asteroids_y = np.array(class_labels)

In [None]:
# Display the feature matrix built from the "Reflectance_norm550nm" values
asteroids_X

array([[0.9281, 0.9388, 0.9488, ..., 1.0165, 1.0181, 1.02  ],
       [0.9758, 0.9788, 0.9816, ..., 0.9815, 0.9795, 0.9762],
       [0.8692, 0.8824, 0.8962, ..., 1.0055, 1.0058, 1.0038],
       ...,
       [0.935 , 0.9435, 0.952 , ..., 1.0451, 1.0486, 1.0532],
       [0.845 , 0.86  , 0.875 , ..., 1.059 , 1.0548, 1.0509],
       [0.919 , 0.9257, 0.9323, ..., 0.9971, 0.9852, 0.9731]])

In [None]:
# Display the label vector built from the "Class" column (1 = "X" group, 0 = otherwise)
asteroids_y

array([0, 0, 0, ..., 1, 0, 1])

In [None]:
# Hold out 20 % of the samples for testing. The split is stratified on
# asteroids_y and seeded (random_state=0) so it is the same on every run.
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1 yields exactly one (train, test) index pair, so take it directly
train_index, test_index = next(sss.split(asteroids_X, asteroids_y))

X_train, y_train = asteroids_X[train_index], asteroids_y[train_index]
X_test, y_test = asteroids_X[test_index], asteroids_y[test_index]

In [None]:
# Fraction of positive labels in the training set, then in the test set
for labels in (y_train, y_test):
    print(sum(labels) / len(labels))

0.17763157894736842
0.1797752808988764


In [None]:
# Share of positive samples in each subset, rounded to two decimals
train_positive_ratio = round(sum(y_train) / len(X_train), 2)
test_positive_ratio = round(sum(y_test) / len(X_test), 2)

print(f"Ratio of positive training classes: {train_positive_ratio}")
print(f"Ratio of positive test classes: {test_positive_ratio}")


Ratio of positive training classes: 0.18
Ratio of positive test classes: 0.18


In [None]:
# Weight for the positive class: the inverse of its share in the training set.
# int() truncates the result towards zero.
positive_ratio = sum(y_train) / len(X_train)
positive_class_weight = int(1.0 / positive_ratio)
print(f"Positive Class weightning: {positive_class_weight}")

Positive Class weightning: 5


In [None]:
# Standardise the features; the scaler is fitted on the training data only
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)

In [None]:
# Train a class-weighted support vector classifier with an RBF kernel
from sklearn import svm

# Only the positive class (label 1) is given an explicit weight
class_weights = {1: positive_class_weight}
wclf = svm.SVC(C=100, kernel="rbf", class_weight=class_weights)

# Fit on the standardised training features
wclf.fit(X_train_scaled, y_train)

In [None]:
# Transform the test features with the scaler that was fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Predict a class label for every test sample
y_test_pred = wclf.predict(X_test_scaled)

In [None]:
# Confusion matrix of the test predictions
# (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print(conf_mat)

[[214   5]
 [  0  48]]


In [None]:
# Recall, precision and F1 of the test predictions, each rounded to three decimals.
# The result variables share their names with the sklearn.metrics functions; the
# functions are always reached through the module, so nothing is shadowed.
recall_score = round(sklearn.metrics.recall_score(y_test, y_test_pred), 3)
print(f"Recall Score: {recall_score}")

precision_score = round(sklearn.metrics.precision_score(y_test, y_test_pred), 3)
# Labelled like the other two metrics so the printed number is self-explanatory
print(f"Precision Score: {precision_score}")

f1_score = round(sklearn.metrics.f1_score(y_test, y_test_pred), 3)
print(f"F1 Score: {f1_score}")

Recall Score: 1.0
0.906
F1 Score: 0.95


In [None]:
# Naive baseline labels: a random permutation of the true labels.
# A dedicated, seeded generator makes the baseline reproducible on every run;
# permutation() returns a shuffled copy, so asteroids_y itself is left untouched.
baseline_rng = np.random.default_rng(seed=0)
asteroids_random_y = baseline_rng.permutation(asteroids_y)

In [None]:
# F1 score of the shuffled labels against the true labels, used as a
# chance-level reference for the classifier's score
naive_f1 = sklearn.metrics.f1_score(asteroids_y, asteroids_random_y)
f1_score_naive = round(naive_f1, 3)
print(f"Naive F1 Score: {f1_score_naive}")

Naive F1 Score: 0.215
