In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the preprocessed dataset
df = pd.read_csv('./archive/prep_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 932335 entries, 0 to 932334
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   neo     932335 non-null  object 
 1   pha     932335 non-null  object 
 2   H       932335 non-null  float64
 3   epoch   932335 non-null  float64
 4   e       932335 non-null  float64
 5   a       932335 non-null  float64
 6   q       932335 non-null  float64
 7   i       932335 non-null  float64
 8   om      932335 non-null  float64
 9   w       932335 non-null  float64
 10  ma      932335 non-null  float64
 11  ad      932335 non-null  float64
 12  n       932335 non-null  float64
 13  tp      932335 non-null  float64
 14  per     932335 non-null  float64
 15  moid    932335 non-null  float64
 16  class   932335 non-null  object 
 17  rms     932335 non-null  float64
dtypes: float64(15), object(3)
memory usage: 128.0+ MB


In [4]:
categorical_columns = df.select_dtypes('object').columns

In [5]:
# One hot encoder
for c in categorical_columns:
    if (c == 'pha'):
        continue
    for l in df[c].unique():
        df[c+"_"+l] = (df[c] == l).astype(int)

df.drop(columns=['class', 'neo'], inplace=True)

In [6]:
df.columns

Index(['pha', 'H', 'epoch', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'ad', 'n',
       'tp', 'per', 'moid', 'rms', 'neo_N', 'neo_Y', 'class_MBA', 'class_OMB',
       'class_MCA', 'class_AMO', 'class_IMB', 'class_TJN', 'class_CEN',
       'class_APO', 'class_ATE', 'class_AST', 'class_TNO', 'class_IEO'],
      dtype='object')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['pha']), df['pha'], train_size=0.75)

In [11]:
# Our pha label is super unbalanced. Let's use oversampling techniques
y_train.value_counts()

pha
N    697662
Y      1589
Name: count, dtype: int64

In [13]:
# Show the shapes of x_train and y_train before oversampling
print("x_train before adasyn:", X_train.shape)
print("y_train before adasyn:", y_train.shape)

x_train before adasyn: (699251, 29)
y_train before adasyn: (699251,)


In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

In [14]:
# We try ADASYN (Adaptive Synthetic Sampling)

adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)
print("Label distributions of y_train after ADASYN")
print(pd.Series(y_train).value_counts())

Label distributions of y_train after ADASYN
pha
N    697662
Y    697403
Name: count, dtype: int64


In [15]:
print("x_train after adasyn:", X_train.shape)
print("y_train after adasyn:", y_train.shape)

x_train after adasyn: (1395065, 29)
y_train after adasyn: (1395065,)


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 932335 entries, 0 to 932334
Data columns (total 30 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   pha        932335 non-null  object 
 1   H          932335 non-null  float64
 2   epoch      932335 non-null  float64
 3   e          932335 non-null  float64
 4   a          932335 non-null  float64
 5   q          932335 non-null  float64
 6   i          932335 non-null  float64
 7   om         932335 non-null  float64
 8   w          932335 non-null  float64
 9   ma         932335 non-null  float64
 10  ad         932335 non-null  float64
 11  n          932335 non-null  float64
 12  tp         932335 non-null  float64
 13  per        932335 non-null  float64
 14  moid       932335 non-null  float64
 15  rms        932335 non-null  float64
 16  neo_N      932335 non-null  int64  
 17  neo_Y      932335 non-null  int64  
 18  class_MBA  932335 non-null  int64  
 19  class_OMB  932335 non-n

In [24]:
# Standardize all the numerical values
numerical_columns = df.select_dtypes('float64').columns
numerical_columns

Index(['H', 'epoch', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'ad', 'n', 'tp',
       'per', 'moid', 'rms'],
      dtype='object')

In [25]:
stds = dict()  # std devs of each numerical feature column
means = dict()      # means of each numerical feature column

# perform z-score normalization
for c in numerical_columns:
    # calculate sigma and mu for X_train
    stds[c] = X_train[c].std()
    means[c] = X_train[c].mean()

    # fit X_train
    X_train[c] = (X_train[c]-means[c])/stds[c]
    # transform X_test accordingly
    X_test[c] = (X_test[c]-means[c])/stds[c]

In [30]:
# encode the output class pha to False (N) and True (Y)
y_train = (y_train == 'Y').astype(int)
y_test = (y_test == 'Y').astype(int)

In [34]:
svm_model1 = SVC(kernel='linear', verbose=True)
svm_model1.fit(X_train, y_train)

[LibSVM]

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [None]:
# save the model

import pickle

with open('svm_model1_linear.pkl', 'wb') as f:
    pickle.dump(svm_model1, f)

In [36]:
y_pred = svm_model1.predict(X_test)

In [37]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    232607
           1       0.36      1.00      0.53       477

    accuracy                           1.00    233084
   macro avg       0.68      1.00      0.76    233084
weighted avg       1.00      1.00      1.00    233084



In [39]:
# training accuracy
print(classification_report(y_train, svm_model1.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    697662
           1       1.00      1.00      1.00    697403

    accuracy                           1.00   1395065
   macro avg       1.00      1.00      1.00   1395065
weighted avg       1.00      1.00      1.00   1395065



In [38]:
svm_model2 = SVC(kernel='rbf', verbose=True)
svm_model2.fit(X_train, y_train)

with open('svm_model2_rbf.pkl', 'wb') as f:
    pickle.dump(svm_model2, f)

y_pred = svm_model2.predict(X_test)
print(classification_report(y_test, y_pred))

[LibSVM]              precision    recall  f1-score   support

           0       1.00      1.00      1.00    232607
           1       0.35      0.97      0.51       477

    accuracy                           1.00    233084
   macro avg       0.68      0.98      0.76    233084
weighted avg       1.00      1.00      1.00    233084



In [40]:
# rbf training accuracy
print(classification_report(y_train, svm_model2.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    697662
           1       1.00      1.00      1.00    697403

    accuracy                           1.00   1395065
   macro avg       1.00      1.00      1.00   1395065
weighted avg       1.00      1.00      1.00   1395065

