In [15]:
import numpy as np
import pandas as pd
import nltk
#below line commented after download
#nltk.download('stopwords')
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
stop_words = stopwords.words("english")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

<b>SMS Spam Collection Data Set</b> <br/>
Dataset downloaded from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection <br/>
Data is seperated via tab delimited


In [27]:
url = '../../Datasets/smsspamcollection/SMSSpamCollection'
sms = pd.read_csv(url, sep='\t', names=["Category", "Message"])
sms.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [29]:
sms.Category.unique()

array(['ham', 'spam'], dtype=object)

In [30]:
sms['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [31]:
sms.iloc[0,1]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

**CountVectorizer**

In [32]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
count_model = vectorizer.fit(sms["Message"])
X = count_model.transform(sms["Message"])

In [34]:
count_model.get_feature_names()

['and', 'for', 'in', 'is', 'it', 'me', 'my', 'the', 'you', 'your']

In [38]:
X.todense()[:5]

matrix([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

**TfidfVectorizer**

In [41]:
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
tfidf_model = vectorizer.fit(sms["Message"])
X = tfidf_model.transform(sms["Message"])

In [42]:
tfidf_model.get_feature_names()

['and', 'for', 'in', 'is', 'it', 'me', 'my', 'the', 'you', 'your']

In [43]:
X.todense()[:5]

matrix([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

**Classification**

In [45]:
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 1000)
tfidf_model = vectorizer.fit(sms["Message"])
pickle.dump(tfidf_model, open("../../Datasets/tfidf.pkl", "wb"))
X = tfidf_model.transform(sms["Message"])
X_train,X_test,y_train,y_test = train_test_split(X,sms["Category"],test_size = 0.1)
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
pickle.dump(clf, open("../../Datasets/text_clf.pkl", 'wb'))
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       486
        spam       1.00      0.82      0.90        72

    accuracy                           0.98       558
   macro avg       0.99      0.91      0.94       558
weighted avg       0.98      0.98      0.98       558

[[486   0]
 [ 13  59]]
