In [61]:
import numpy as np
import pandas as pd
import nltk
#below line commented after download
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
stop = stopwords.words('english')
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
stop_words = stopwords.words("english")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

<b>SMS Spam Collection Data Set</b> <br/>
Dataset downloaded from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection <br/>
Data is seperated via tab delimited


In [132]:
url = '../../Datasets/smsspamcollection/SMSSpamCollection'
sms = pd.read_csv(url, sep='\t', names=["Category", "Message"])
sms.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [29]:
sms.Category.unique()

array(['ham', 'spam'], dtype=object)

In [30]:
sms['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [52]:
sms.iloc[0,1]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [144]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Stopword Removing & Stemming**

In [162]:
porter = nltk.PorterStemmer()
word_list = stopwords.words('english')

for i, column in sms.iterrows():
    #print(sms.Message[i])
    #print (i, column.Message)
    
    #splitting sentence into words
    splitted_row = column.Message.split(' ')
    
    #Stemming
    splitted_row = [porter.stem(t) for t in splitted_row]
    
    #Cleaning stopword
    cleaned_row = np.array(list(filter(lambda x: x not in word_list, splitted_row)))
    sms.Message[i] = " ".join(cleaned_row)
    
sms.head(10)

Unnamed: 0,Category,Message
0,ham,"Go jurong point, crazy.. avail bugi n great wo..."
1,ham,Ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win FA cup final tkt 21...
3,ham,U dun say earli hor... U c alreadi say...
4,ham,"nah I think goe usf, live around though"
5,spam,freemsg hey darl 3 week' word back! i'd like f...
6,ham,even brother like speak me. treat like aid pat...
7,ham,As per request 'mell mell (oru minnaminungint ...
8,spam,winner!! As valu network custom select receive...
9,spam,mobil 11 month more? U R entitl updat latest c...


Stopwords removed successfully

**CountVectorizer**

In [163]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
count_model = vectorizer.fit(sms["Message"])
X = count_model.transform(sms["Message"])

In [164]:
count_model.get_feature_names()

['call', 'come', 'free', 'get', 'go', 'gt', 'it', 'lt', 'ok', 'ur']

In [168]:
X.todense()[:5]

matrix([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

**TfidfVectorizer**

In [165]:
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
tfidf_model = vectorizer.fit(sms["Message"])
X = tfidf_model.transform(sms["Message"])

In [166]:
tfidf_model.get_feature_names()

['call', 'come', 'free', 'get', 'go', 'gt', 'it', 'lt', 'ok', 'ur']

In [167]:
X.todense()[:5]

matrix([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

**Classification**

In [169]:
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 1000)
tfidf_model = vectorizer.fit(sms["Message"])
pickle.dump(tfidf_model, open("../../Datasets/tfidf.pkl", "wb"))
X = tfidf_model.transform(sms["Message"])
X_train,X_test,y_train,y_test = train_test_split(X,sms["Category"],test_size = 0.1)
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
pickle.dump(clf, open("../../Datasets/text_clf.pkl", 'wb'))
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       496
        spam       0.96      0.82      0.89        62

    accuracy                           0.98       558
   macro avg       0.97      0.91      0.94       558
weighted avg       0.98      0.98      0.98       558

[[494   2]
 [ 11  51]]
