In [1]:
# --- Standard library ---
import pickle
import string

# --- Third-party ---
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import gutenberg, stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# One-time setup; left commented out after the initial download.
# nltk.download('punkt')

# English stopwords from NLTK, loaded once for each of the two names bound here.
stop = stopwords.words('english')
stop_words = stopwords.words("english")

<b>SMS Spam Collection Data Set</b> <br/>
Dataset downloaded from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection <br/>
The data file is tab-delimited: each line holds a category label and a message separated by a tab.


In [2]:
# Load the SMS Spam Collection. The file has no header row, so the two
# tab-separated fields are named explicitly.
url = '../../Datasets/smsspamcollection/SMSSpamCollection'
# NOTE(review): with pandas' default quoting, a message containing an unbalanced
# double quote can swallow following lines; consider quoting=csv.QUOTE_NONE and
# compare the resulting row count against the dataset's documentation -- confirm.
sms = pd.read_csv(url, sep='\t', names=["Category", "Message"])

# Keep an independent snapshot of the raw data. A plain `sms_copy = sms` only
# binds a second name to the same DataFrame, so any later in-place edit of
# `sms` would silently change the "copy" as well.
sms_copy = sms.copy()

# Preview the first rows instead of dumping the whole frame.
sms_copy.head(10)

Unnamed: 0,Category,Message
0,“tv future in the hands of viewers with home t...,


In [38]:
# Distinct labels present in the Category column.
sms["Category"].unique()

array(['ham', 'spam'], dtype=object)

In [39]:
# Number of messages per label (class balance check before modelling).
sms.loc[:, "Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [40]:
# Text of the first message ("Message" is the second column of `sms`).
sms["Message"].iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [41]:
# Show the NLTK English stopword list that the cleaning step filters against.
english_stopwords = stopwords.words("english")
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Cleaning Text in the 'Message' Column** (applied to each message in turn)
<br/>
<ol><li>Stemming</li><li>Punctuation removal</li><li>Stopword removal</li></ol>

In [53]:
# Normalise every message: stem each space-separated token, strip ASCII
# punctuation, then drop tokens that exactly match an NLTK English stopword.
porter = nltk.PorterStemmer()
word_list = stopwords.words('english')
table = str.maketrans('', '', string.punctuation)

# A set gives constant-time membership tests; `word_list` itself is left as a
# list in case other cells rely on it.
stopword_set = set(word_list)


def clean_message(message):
    """Return `message` after stemming, punctuation stripping and stopword filtering.

    Parameters
    ----------
    message : str
        Raw text; it is split on single spaces, so consecutive spaces produce
        empty tokens that are kept as-is.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.

    Notes
    -----
    Stopword matching is exact and runs on the already stemmed, punctuation-free
    tokens, so a stopword whose stem differs from its surface form is not
    filtered out.
    """
    # Stemming
    stemmed_tokens = [porter.stem(token) for token in message.split(' ')]

    # Punctuation cleaning
    stripped_tokens = [token.translate(table) for token in stemmed_tokens]

    # Stopword cleaning
    return " ".join(token for token in stripped_tokens if token not in stopword_set)


# Assign the whole column in one step. The previous `sms.Message[i] = ...` was
# chained assignment: it raises SettingWithCopyWarning on older pandas and does
# not modify `sms` at all once Copy-on-Write is enabled.
sms["Message"] = sms["Message"].map(clean_message)

sms

Unnamed: 0,Category,Message
0,ham,Go jurong point crazi avail onli bugi n great ...
1,ham,Ok lar joke wif u oni
2,spam,free entri 2 wkli comp win FA cup final tkt 21...
3,ham,U dun say earli hor U c alreadi say
4,ham,nah I dont think goe usf live around though
...,...,...
5567,spam,thi 2nd time tri 2 contact u U £750 pound priz...
5568,ham,ü b go esplanad fr home
5569,ham,piti wa mood soani suggest
5570,ham,guy bitch I act like id interest buy someth el...


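Stopwords have been removed, stemming has been applied, and punctuation has been stripped. Note that a few stopwords still appear in the output (e.g. "thi", "wa", "I"), because the stopword filter runs after stemming and matches case-sensitively.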

**CountVectorizer**

In [54]:
# Bag-of-words demo: keep at most 10 word features and ignore any term that
# occurs in more than 30% of the messages (max_df=0.3).
vectorizer = CountVectorizer(analyzer="word", max_features=10, max_df=0.3)
count_model = vectorizer.fit(sms["Message"])
X = count_model.transform(sms["Message"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so wrap it
# in list() to keep the plain-list display.
list(count_model.get_feature_names_out())

['call', 'come', 'dont', 'get', 'go', 'im', 'ltgt', 'ok', 'thi', 'ur']

In [55]:
# Slice the sparse matrix first so only the five previewed rows are densified,
# rather than converting every row and discarding all but five.
X[:5].todense()

matrix([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

**TfidfVectorizer**

In [50]:
# TF-IDF demo with the same vocabulary limits as the count-based example:
# at most 10 word features, ignoring terms found in more than 30% of messages.
vectorizer = TfidfVectorizer(analyzer="word", max_features=10, max_df=0.3)
tfidf_model = vectorizer.fit(sms["Message"])
X = tfidf_model.transform(sms["Message"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so wrap it
# in list() to keep the plain-list display.
list(tfidf_model.get_feature_names_out())

['call', 'come', 'dont', 'get', 'go', 'im', 'ltgt', 'ok', 'thi', 'ur']

In [51]:
# Slice the sparse matrix first so only the five previewed rows are densified,
# rather than converting every row and discarding all but five.
X[:5].todense()

matrix([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

**Classification**

In [52]:
# Train, persist and evaluate a TF-IDF + logistic-regression spam classifier.

# Vocabulary is capped at the 1000 most frequent terms.
vectorizer = TfidfVectorizer(analyzer="word", max_features=1000)
tfidf_model = vectorizer.fit(sms["Message"])

# Context managers guarantee the pickle files are flushed and closed; passing a
# bare open(...) to pickle.dump leaves the handle open until garbage collection.
with open("../../Datasets/tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_model, tfidf_file)

X = tfidf_model.transform(sms["Message"])

# NOTE(review): the vectorizer is fitted on all messages before the split, so
# the held-out rows influence the vocabulary and IDF weights. Consider fitting
# on the training split only if a strictly unbiased estimate is needed.
# A fixed random_state makes the split, and therefore the report, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, sms["Category"], test_size=0.1, random_state=42
)

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

with open("../../Datasets/text_clf.pkl", 'wb') as clf_file:
    pickle.dump(clf, clf_file)

# Evaluate on the held-out 10% of messages.
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       493
        spam       0.96      0.80      0.87        65

    accuracy                           0.97       558
   macro avg       0.97      0.90      0.93       558
weighted avg       0.97      0.97      0.97       558

[[491   2]
 [ 13  52]]
