In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [4]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# funfact: X is capitalised bc it is a larger matrix, whereas y is 1 dimensional
X = df['message']
y = df['label']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Count Vectoriser

In [7]:
# Count vectoriser includes text preprocessing, tokenization, and filter stop words
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

In [8]:
# Fit transform
# fit vectorizer to data (build vocab, count words...)
# transform text to vector
count_vect.fit(X)
X_train = count_vect.transform(X_train)
X_test = count_vect.transform(X_test)

In [17]:
X_train.shape

(3733, 8713)

## Train model classifier

In [10]:
from sklearn.svm import LinearSVC

clf = LinearSVC()

In [11]:
clf.fit(X_train, y_train)

LinearSVC()

In [12]:
predictions = clf.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))

[[1593    0]
 [  20  226]]


In [14]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       1.00      0.92      0.96       246

    accuracy                           0.99      1839
   macro avg       0.99      0.96      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [15]:
from sklearn import metrics

metrics.accuracy_score(y_test, predictions)

0.9891245241979336

# Test on our own data

In [20]:
test = count_vect.transform(['Hi how are you doing today?'])
clf.predict(test)

array(['ham'], dtype=object)

In [21]:
test = count_vect.transform(['Congratulations! you have been selected as winner TEXT WON 44355 free entry to contest.'])
clf.predict(test)


array(['spam'], dtype=object)