In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
import io

# pd.read_csv cannot read this file directly (UTF-8 decode error), so read the
# raw bytes ourselves and decode them leniently before parsing.
with open('data/spamSMS.csv', 'rb') as csv_file:
    data = csv_file.read()

# errors='replace' turns every undecodable byte sequence into U+FFFD;
# each of those placeholders is then swapped for a plain '?'.
text = data.decode('utf-8', errors='replace').replace('\ufffd', '?')

# Parse the cleaned text as CSV from an in-memory buffer
csv_buffer = io.StringIO(text)
df = pd.read_csv(csv_buffer)

In [30]:
# Count how many rows carry each distinct label in column 'v1'
label_counts = df['v1'].value_counts()
print(label_counts)

ham     4825
spam     747
Name: v1, dtype: int64


In [31]:
# Split the raw messages BEFORE vectorizing. Fitting CountVectorizer on the full
# corpus would build the vocabulary from test messages too, leaking information
# from the held-out set into the features the model is trained on.
y = df['v1']
text_train, text_test, y_train, y_test = train_test_split(
    df['v2'], y, test_size=0.3, random_state=42
)

# Learn the bag-of-words vocabulary from the training messages only, then apply
# that fixed vocabulary to the test messages (unseen words are simply ignored).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix, kept under its original name in case another cell uses X.
# Its columns now follow the train-only vocabulary.
X = vectorizer.transform(df['v2'])


# RBF kernel

In [32]:
# RBF-kernel support vector classifier; no other hyperparameters are set here
svm = SVC(kernel='rbf')

In [33]:
# Exhaustive search: every (C, gamma) pair below (3 x 3 = 9 candidates) is
# scored by accuracy with 5-fold cross-validation on the training split.
grid = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01],
}
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)  # fit() returns the fitted search object itself

print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

Best Parameters: {'C': 10, 'gamma': 0.01}
Best Score: 0.9825641025641026


In [34]:
# Randomized search over a log-spaced grid of C and gamma values.
# The gamma list previously contained 0.01 twice and skipped 0.1, so one decade
# was never tried while another was sampled with double weight.
random_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
}

# RandomizedSearchCV samples n_iter (default 10) of the 25 combinations, so
# without a fixed random_state a re-run may evaluate different candidates and
# report a different "best" model.
random_search = RandomizedSearchCV(
    svm,
    param_distributions=random_grid,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)
print('Best Parameters:', random_search.best_params_)
print('Best Score:', random_search.best_score_)

Best Parameters: {'gamma': 0.01, 'C': 10}
Best Score: 0.9825641025641026


In [35]:
# Predict the held-out messages with the model selected by the random search
y_pred = random_search.predict(X_test)

# Overall fraction of test messages classified correctly
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', test_accuracy)

# Confusion matrix of true labels (y_test) against predictions (y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

Accuracy: 0.9802631578947368
[[1447    6]
 [  27  192]]


# Linear kernel

In [36]:
# Rebind `svm` to a linear-kernel classifier; this replaces the RBF estimator defined earlier
svm = SVC(kernel='linear')

In [37]:
# Exhaustive search over C only (13 candidates), each scored by accuracy
# with 5-fold cross-validation on the training split.
grid = {
    'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10, 100],
}
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)  # fit() returns the fitted search object itself

print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

Best Parameters: {'C': 0.5}
Best Score: 0.9830769230769232


In [38]:
# Randomized search over the same 13 C values used by the grid search.
random_grid = {
    'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10, 100],
}

# Only n_iter (default 10) of the 13 candidates are sampled per run, so the
# winner depends on which values happen to be drawn. Fixing random_state makes
# the sampled subset, and therefore the reported best C, repeatable across
# Restart & Run All.
random_search = RandomizedSearchCV(
    svm,
    param_distributions=random_grid,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)
print('Best Parameters:', random_search.best_params_)
print('Best Score:', random_search.best_score_)

Best Parameters: {'C': 0.7}
Best Score: 0.9828205128205129


In [39]:
# Predict the held-out messages with the model selected by the random search
y_pred = random_search.predict(X_test)

# Overall fraction of test messages classified correctly
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', test_accuracy)

# Confusion matrix of true labels (y_test) against predictions (y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

Accuracy: 0.9778708133971292
[[1448    5]
 [  32  187]]
