In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the tab-separated ham/spam message dataset and preview the first rows.
df = pd.read_csv('spam.tsv', sep = '\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
# Count missing values per column.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [4]:
# (rows, columns) of the full dataset.
df.shape

(5572, 4)

In [5]:
# Class distribution of the labels, to check for imbalance.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

### Balance the Dataset

In [6]:
# Keep only the rows labelled 'ham' and preview them.
ham = df.loc[df['label'] == 'ham']
ham.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
6,ham,Even my brother is not like to speak with me. ...,77,2


In [7]:
# Keep only the rows labelled 'spam' and preview them.
spam = df.loc[df['label'] == 'spam']
spam.head()

Unnamed: 0,label,message,length,punct
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136,8


In [8]:
# Compare the class sizes before downsampling.
ham.shape, spam.shape

((4825, 4), (747, 4))

In [9]:
# Downsample ham to the number of spam rows so the two classes are balanced.
# A fixed seed makes the selected rows, and every result derived from them,
# reproducible across kernel restarts.
ham = ham.sample(n=spam.shape[0], random_state=0)

In [10]:
# Confirm both classes now have the same number of rows.
ham.shape, spam.shape

((747, 4), (747, 4))

In [11]:
# Stack the ham rows on top of the spam rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
data = pd.concat([ham, spam], ignore_index=True)

In [12]:
# Spot-check five random rows of the combined frame (no seed is passed, so the
# rows shown differ between runs).
data.sample(5)

Unnamed: 0,label,message,length,punct
1103,spam,Knock Knock Txt whose there to 80082 to enter ...,154,6
1167,spam,Free entry in 2 a weekly comp for a chance to ...,143,6
36,ham,Its worse if if uses half way then stops. Its ...,76,2
1062,spam,88066 FROM 88066 LOST 3POUND HELP,33,0
405,ham,What makes you most happy?,26,1


In [13]:
# (rows, columns) of the combined ham + spam frame.
data.shape

(1494, 4)

In [14]:
# Verify that the two classes have equal counts after balancing.
data['label'].value_counts()

ham     747
spam    747
Name: label, dtype: int64

### Feature Extraction and Train/Test Split

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,label,message,length,punct
0,ham,Hope you are having a good week. Just checking in,49,1
1,ham,I am on the way to tirupur.,27,1
2,ham,:-( that's not v romantic!,26,5
3,ham,"Oh, i will get paid. The most outstanding one ...",159,8
4,ham,It didnt work again oh. Ok goodnight then. I.l...,146,5


In [18]:
# Turn each message into a TF-IDF weighted term vector.
# NOTE(review): the vectorizer is fitted on every row of `data`, before the
# train/test split further down, so the vocabulary and IDF weights also reflect
# messages that end up in the test set. Fitting on the training messages only
# (e.g. vectorizer + classifier inside a Pipeline) would avoid this leakage.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['message'])

In [19]:
# Densify the sparse TF-IDF matrix into a NumPy array. The guard keeps the cell
# idempotent: once X has been rebound to an ndarray (which has no `.toarray`),
# re-running the cell is a no-op instead of raising AttributeError.
if hasattr(X, 'toarray'):
    X = X.toarray()

In [20]:
# (number of messages, number of TF-IDF features).
X.shape

(1494, 4593)

In [21]:
# Display the feature matrix (NumPy abbreviates large arrays).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# ham/spam ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['label'],
    test_size=0.2,
    random_state=0,
    stratify=data['label'],
)

In [23]:
# Sizes of the training and test feature matrices.
X_train.shape, X_test.shape

((1195, 4593), (299, 4593))

### Training Your First Text Classifier 
SVM

In [24]:
# Support vector classifier with SVC's default (RBF) kernel. A large C means
# weak regularisation; gamma='auto' sets the kernel coefficient to 1 / n_features.
clf = SVC(C = 1000, gamma = 'auto')

In [25]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

SVC(C=1000, gamma='auto')

In [26]:
# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

In [27]:
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (ham, spam).
confusion_matrix(y_test, y_pred)

array([[149,   1],
       [ 18, 131]], dtype=int64)

In [28]:
# Per-class precision, recall and F1 on the test split. print() is needed here
# because the report is a pre-formatted multi-line string.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.89      0.99      0.94       150
        spam       0.99      0.88      0.93       149

    accuracy                           0.94       299
   macro avg       0.94      0.94      0.94       299
weighted avg       0.94      0.94      0.94       299



### Test Model with Real Data 

In [29]:
# Show the fitted estimator and its hyperparameters.
clf

SVC(C=1000, gamma='auto')

In [30]:
def predict(x):
    """Classify one message, or several, as 'ham' or 'spam'.

    Parameters
    ----------
    x : str, bytes, or iterable of str
        A single message, or an iterable of messages to classify together.

    Returns
    -------
    numpy.ndarray
        Predicted labels, one per message. A single message still yields a
        length-1 array.

    Notes
    -----
    Relies on the notebook-level `tfidf` vectorizer and `clf` classifier
    having already been fitted.
    """
    # A lone message is wrapped in a list because the vectorizer expects an
    # iterable of documents; anything else is treated as a batch of messages.
    messages = [x] if isinstance(x, (str, bytes)) else list(x)
    # transform (not fit_transform): reuse the already-fitted vocabulary and IDF weights.
    features = tfidf.transform(messages)
    # Densify to match the dense arrays the classifier was trained on.
    return clf.predict(features.toarray())

In [31]:
# A casual greeting; expected to be classified as ham.
predict('hey, whassup')

array(['ham'], dtype=object)

In [32]:
# A promotional-sounding message; expected to be classified as spam.
predict('you have got free tickets to the usa this summer')

array(['spam'], dtype=object)