In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

from sklearn.svm import SVC

In [2]:
# Path to the JSON file holding the e-mail text data.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

# Load the frame and order its rows by index label in one chained expression.
data = pd.read_json(DATA_JSON_FILE).sort_index()

# Bag-of-words counts with scikit-learn's built-in English stop-word list.
# NOTE(review): the vocabulary is fitted on every row, including rows that
# later land in the test split.
vectorizer = CountVectorizer(stop_words='english')
all_features = vectorizer.fit_transform(data['MESSAGE'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['CATEGORY'],
    test_size=0.3,
    random_state=88,
)

In [3]:
# Support-vector classifier with library defaults; verbose=1 makes the
# underlying solver print its progress while fitting.
classifier = SVC(verbose=1)
# Train on the training split (fit returns the estimator, which is what the
# cell displays).
classifier.fit(X=X_train, y=y_train)

[LibSVM]

SVC(verbose=1)

In [4]:
# Predict labels for the held-out split once and keep them in `result`.
result = classifier.predict(X_test)
# Confusion matrix: rows follow the true labels, columns the predicted ones.
test_confusion = confusion_matrix(y_true=y_test, y_pred=result)
print(test_confusion)

[[1179    6]
 [ 190  364]]


In [5]:
# Mean accuracy of the fitted classifier on the held-out split.
classifier.score(X=X_test, y=y_test)

0.887291546866015

In [6]:
# Recall on the held-out split. Reuses `result` (the test-set predictions
# computed in cell In [4]) instead of running classifier.predict(X_test)
# again; the predictions are the same, so the score is unchanged.
recall_score(y_test, result)

0.6570397111913358

In [7]:
# Precision on the held-out split. Reuses `result` (the test-set predictions
# computed in cell In [4]) instead of running classifier.predict(X_test)
# again; the predictions are the same, so the score is unchanged.
precision_score(y_test, result)

0.9837837837837838

In [8]:
# F1 score on the held-out split. Reuses `result` (the test-set predictions
# computed in cell In [4]) instead of running classifier.predict(X_test)
# again; the predictions are the same, so the score is unchanged.
f1_score(y_test, result)

0.7878787878787878

In [9]:
# Hand-written messages used to spot-check the trained classifier.
# One literal per line so each sample can be read and edited on its own.
example = [
    'get iphone for free now!',
    'need mortgage? Reply to arrange a call with a specialist and get a quote',
    'could you please help me with the project for tomorrow?',
    'Hello Jonathan, how about a game of golf tomorrow?',
    'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.',
]

In [10]:
# Encode the sample messages with the already-fitted vectorizer (transform
# only, no refit), so the columns match the features the classifier saw.
doc_term_matrix = vectorizer.transform(raw_documents=example)

In [11]:
# Predicted label for each sample message, in the same order as `example`.
# NOTE(review): the recorded run returned the same label (0) for all five
# samples - confirm the CATEGORY encoding and whether that is expected.
classifier.predict(X=doc_term_matrix)

array([0, 0, 0, 0, 0], dtype=int64)