In [1]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
# Load the pre-split feature and label CSVs from the working directory.
# Each line reads one split as a (features, labels) pair: training first,
# then validation, then test.
X_tr, y_tr = pd.read_csv('train_features.csv'), pd.read_csv('train_labels.csv')
X_val, y_val = pd.read_csv('validation_features.csv'), pd.read_csv('validation_labels.csv')
X_te, y_te = pd.read_csv('test_features.csv'), pd.read_csv('test_labels.csv')

In [15]:
# Report the number of rows in each split and their combined total.
n_train, n_val, n_test = (split.shape[0] for split in (X_tr, X_val, X_te))
print('Training set:', n_train)
print('Validation set:', n_val)
print('Test set:', n_test)
print('Total size:', n_train + n_val + n_test)

Training set: 3781
Validation set: 1261
Test set: 1261
Total size: 6303


In [4]:
def print_results(results):
    """Print a summary of a fitted parameter search.

    Writes the best parameter combination followed by a blank line, then one
    line per evaluated combination giving its mean and standard deviation of
    the test score (two decimals each) and the parameters themselves.

    Args:
        results: object exposing ``best_params_`` and a ``cv_results_``
            mapping with the keys ``'mean_test_score'``, ``'std_test_score'``
            and ``'params'`` (e.g. the fitted search object passed in by the
            grid-search cell of this notebook).

    Returns:
        None. Output goes to stdout only.
    """
    header = "Best parameters: {}\n"
    row = "Mean: {:.2f}, Std: {:.2f} for {}"

    print(header.format(results.best_params_))

    # Pair each candidate's mean and spread with the parameters that produced it.
    candidates = zip(
        results.cv_results_['mean_test_score'],
        results.cv_results_['std_test_score'],
        results.cv_results_['params'],
    )
    for avg_score, score_std, combo in candidates:
        print(row.format(avg_score, score_std, combo))

In [5]:
# Exhaustive 5-fold cross-validated search over the regularisation strength C
# for a linear-kernel SVC, using the training split only.
svc_tuned = SVC()
parameters = {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]}

# fit() hands back the search object itself, so construction and fitting can
# be chained; the labels are flattened to a 1-D array first.
cv = GridSearchCV(estimator=svc_tuned, param_grid=parameters, cv=5).fit(
    X_tr, y_tr.values.ravel()
)
print_results(cv)

Best parameters: {'C': 1, 'kernel': 'linear'}

Mean: 0.97, Std: 0.01 for {'C': 0.1, 'kernel': 'linear'}
Mean: 0.98, Std: 0.01 for {'C': 1, 'kernel': 'linear'}
Mean: 0.98, Std: 0.01 for {'C': 10, 'kernel': 'linear'}
Mean: 0.98, Std: 0.01 for {'C': 100, 'kernel': 'linear'}


In [11]:
# Show the estimator configuration that scored best in the grid search.
print(cv.best_estimator_)

SVC(C=1, kernel='linear')


In [6]:
# Keep the winning hyper-parameters so a fresh SVC can be built from them later.
best_params = cv.best_params_

In [7]:
# Fit every candidate classifier on the same training split.
# The labels are flattened to 1-D once and reused for each fit.
y_tr_flat = y_tr.values.ravel()

# fit() returns the estimator itself, so each model is built and trained in
# a single expression.
svm_base = SVC().fit(X_tr, y_tr_flat)
svm_linear = SVC(kernel='linear').fit(X_tr, y_tr_flat)
svm_poly = SVC(kernel='poly').fit(X_tr, y_tr_flat)
# Rebuilt from the hyper-parameters selected by the grid search.
svm_tuned = SVC(**best_params).fit(X_tr, y_tr_flat)
lr_base = LogisticRegression(max_iter=1000).fit(X_tr, y_tr_flat)

In [8]:
# Compare all fitted candidates on the validation split.
# Every metric is scaled to a percentage before printing.
print("Validation set results")
for mdl in (svm_base, svm_linear, svm_poly, svm_tuned, lr_base):
    y_pred = mdl.predict(X_val)
    accuracy, precision, recall, f1 = (
        scorer(y_val, y_pred) * 100
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # One report per model: its repr, the four metrics, then a blank line.
    report = [
        str(mdl),
        "Accuracy: {:.2f}%".format(accuracy),
        "Precision: {:.2f}%".format(precision),
        "Recall: {:.2f}%".format(recall),
        "F1: {:.2f}%".format(f1),
        "",
    ]
    print("\n".join(report))

Validation set results
SVC()
Accuracy: 94.77%
Precision: 97.32%
Recall: 81.15%
F1: 88.50%

SVC(kernel='linear')
Accuracy: 97.30%
Precision: 96.66%
Recall: 92.33%
F1: 94.44%

SVC(kernel='poly')
Accuracy: 94.21%
Precision: 95.45%
Recall: 80.51%
F1: 87.35%

SVC(C=1, kernel='linear')
Accuracy: 97.30%
Precision: 96.66%
Recall: 92.33%
F1: 94.44%

LogisticRegression(max_iter=1000)
Accuracy: 96.91%
Precision: 96.60%
Recall: 90.73%
F1: 93.57%



In [16]:
# Persist the tuned SVM to disk so it can be reused without retraining.
# joblib is imported in the notebook's top import cell with the other
# dependencies rather than here.
# NOTE: the resulting file is a pickle; only load it back with joblib.load
# when it comes from a trusted source.
joblib.dump(svm_tuned, 'best_model.pkl')

['best_model.pkl']

In [12]:
# Score the selected model (svm_tuned) on the test split.
# Every metric is scaled to a percentage before printing.
print("Test set results for the best model")
y_pred = svm_tuned.predict(X_te)
accuracy, precision, recall, f1 = (
    scorer(y_te, y_pred) * 100
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

report = [
    "Accuracy: {:.2f}%".format(accuracy),
    "Precision: {:.2f}%".format(precision),
    "Recall: {:.2f}%".format(recall),
    "F1: {:.2f}%".format(f1),
    "",
]
print("\n".join(report))

Test set results for the best model
Accuracy: 97.46%
Precision: 96.44%
Recall: 92.49%
F1: 94.43%



In [9]:
# Score every fitted candidate on the test split for a side-by-side view.
# Every metric is scaled to a percentage before printing.
print("Test set results")
for mdl in (svm_base, svm_linear, svm_poly, svm_tuned, lr_base):
    y_pred = mdl.predict(X_te)
    accuracy, precision, recall, f1 = (
        scorer(y_te, y_pred) * 100
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # One report per model: its repr, the four metrics, then a blank line.
    report = [
        str(mdl),
        "Accuracy: {:.2f}%".format(accuracy),
        "Precision: {:.2f}%".format(precision),
        "Recall: {:.2f}%".format(recall),
        "F1: {:.2f}%".format(f1),
        "",
    ]
    print("\n".join(report))

Test set results
SVC()
Accuracy: 94.92%
Precision: 95.98%
Recall: 81.57%
F1: 88.19%

SVC(kernel='linear')
Accuracy: 97.46%
Precision: 96.44%
Recall: 92.49%
F1: 94.43%

SVC(kernel='poly')
Accuracy: 94.69%
Precision: 95.56%
Recall: 80.89%
F1: 87.62%

SVC(C=1, kernel='linear')
Accuracy: 97.46%
Precision: 96.44%
Recall: 92.49%
F1: 94.43%

LogisticRegression(max_iter=1000)
Accuracy: 97.30%
Precision: 96.75%
Recall: 91.47%
F1: 94.04%

