In [1]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('sms_spam.csv')
df.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [9]:
df.isnull().sum()
#To check whether we are missing any data

type    0
text    0
dtype: int64

In [11]:
df['type'].unique()

array(['ham', 'spam'], dtype=object)

In [12]:
df['type'].value_counts()

ham     4812
spam     747
Name: type, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [22]:
X = df['text']
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3,random_state = 40)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()

In [25]:
#Tranform the original text message to the vector
X_train_counts = count_vec.fit_transform(X_train)

In [27]:
X_train_counts.shape

(3891, 7207)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train)

In [33]:
from sklearn.svm import LinearSVC

In [34]:
clf = LinearSVC()

In [35]:
clf.fit(X_train_tfidf,y_train)

LinearSVC()

In [36]:
from sklearn.pipeline import Pipeline

In [37]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [38]:
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [39]:
predictions = text_clf.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

In [41]:
print(confusion_matrix(y_test,predictions))

[[1458    1]
 [  21  188]]


In [44]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1459
        spam       0.99      0.90      0.94       209

    accuracy                           0.99      1668
   macro avg       0.99      0.95      0.97      1668
weighted avg       0.99      0.99      0.99      1668



In [45]:
from sklearn import metrics
metrics.accuracy_score(y_test,predictions)

0.986810551558753

In [51]:
import pandas as pd

In [53]:
df2 = pd.read_csv('spam_ham_dataset.csv')
df2

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [55]:
df2.drop(['Unnamed: 0'],axis=1)

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [59]:
df2.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [58]:
df2['label'].unique()

array(['ham', 'spam'], dtype=object)

In [60]:
df2['label'].value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

## Performing the train test split

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
X = df2['text']
y = df2['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 37)

In [63]:
len(X_train),len(X_test)

(3619, 1552)

## Performing the fit directly using pipeline

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [66]:
tfidf = TfidfVectorizer()
clf = LinearSVC()

In [67]:
from sklearn.pipeline import Pipeline

In [68]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [69]:
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [70]:
predictions = text_clf.predict(X_test)

## Printing the confusion matrix, classification report and accuracy score

In [71]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [72]:
print(confusion_matrix(y_test,predictions))

[[1074   12]
 [   8  458]]


In [73]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1086
        spam       0.97      0.98      0.98       466

    accuracy                           0.99      1552
   macro avg       0.98      0.99      0.98      1552
weighted avg       0.99      0.99      0.99      1552



In [74]:
print(accuracy_score(y_test,predictions))

0.9871134020618557
