# SMS Spam Collection Dataset

The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.(https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [2]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), ".."))

## Packages

In [15]:
import re

import pandas as pd
import numpy as np

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

## Dataset

In [4]:
dataset = pd.read_csv("../datasets/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
dataset = dataset.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
dataset = dataset.rename(columns={"v1":"label", "v2":"text"})

In [5]:
dataset.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
dataset.groupby(["label"]).agg(["count"])

Unnamed: 0_level_0,text
Unnamed: 0_level_1,count
label,Unnamed: 1_level_2
ham,4825
spam,747


## Text processing

- Cleaning text data
    - user lowercase
    - remove non-word characters (excluding emoticons)
    - remove numbers

In [7]:
def preprocessor(text):
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub("[\W\d]+", " ", text.lower()) + ' '.join(emoticons).replace("-", "")
    return text

In [8]:
dataset["text"] = dataset["text"].apply(preprocessor)

### Utils

In [9]:
def tokenizer(text):
    return text.split()

def tokenizer_stemmer(stemmer):
    def tokenizer(text):
        return [ stemmer.stem(word) for word in text.split() ]
    return tokenizer

tokenizer_porter = tokenizer_stemmer(PorterStemmer())

In [10]:
tokenizer_porter("I am riding my bike")

['I', 'am', 'ride', 'my', 'bike']

## Training & test dataset

In [11]:
y = dataset.label.map({"ham": 0, "spam": 1})
X = dataset["text"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [13]:
print("Training sample:\n", pd.Series(y_train).value_counts(normalize=True))
print("Test sample:\n", pd.Series(y_test).value_counts(normalize=True))

Training sample:
 0    0.865897
1    0.134103
Name: label, dtype: float64
Test sample:
 0    0.866029
1    0.133971
Name: label, dtype: float64


## Logistic Regression

In [208]:
lr_tfidf = Pipeline([
    ("vect", TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)),
    ("clf", LogisticRegression())
])

In [212]:
stop = stopwords.words("english")

param_grid = [
    {
        'vect__ngram_range': [(1, 1), (2, 2)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0]
    },
#     {
#         'vect__ngram_range': [(1, 1), (2, 2)],
#         'vect__stop_words': [stop, None],
#         'vect__tokenizer': [tokenizer, tokenizer_porter],
#         'vect__use_idf':[False],
#         'vect__norm':[None],
#         'clf__penalty': ['l1', 'l2'],
#         'clf__C': [1.0, 10.0, 100.0]
#     }
]

In [213]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  7.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1), (2, 2)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it...<locals>.tokenizer at 0x7fe8d3965158>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [214]:
gs_lr_tfidf.best_params_

{'clf__C': 100.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer_stemmer.<locals>.tokenizer>}

In [216]:
clf = gs_lr_tfidf.best_estimator_

In [220]:
print("Test accuracy: %.3f" % clf.score(X_test, y_test))

Test accuracy: 0.986


In [223]:
y_test_pred = clf.predict(X_test)

In [225]:
dataset.iloc[313,]

label                                                  ham
text     hi the way i was with u day is the normal way ...
Name: 313, dtype: object

## Bag-of-words

In [198]:
count = CountVectorizer(stop_words="english", ngram_range=(2, 2)).fit(dataset["text"])
bag = count.transform(dataset["text"])

In [199]:
count.get_feature_names()

['____ joy',
 'aa exhaust',
 'aah bless',
 'aah cuddle',
 'aah speak',
 'aaniye pudunga',
 'aaooooright work',
 'aathi dear',
 'aathi love',
 'ab sara',
 'abbey happy',
 'abdomen gynae',
 'abeg make',
 'aberdeen united',
 'abi hw',
 'abi just',
 'ability listen',
 'ability question',
 'abj serving',
 'able atten',
 'able buy',
 'able come',
 'able deliver',
 'able dont',
 'able eat',
 'able friday',
 'able half',
 'able join',
 'able kids',
 'able late',
 'able little',
 'able ll',
 'able met',
 'able morning',
 'able ors',
 'able pay',
 'able raise',
 'able reply',
 'able shopping',
 'able sleep',
 'able value',
 'abnormally ll',
 'aboutas chance',
 'abroad lonely',
 'absence gud',
 'absolutely love',
 'absolutly fine',
 'abstract wake',
 'abt abt',
 'abt character',
 'abt concentrate',
 'abt dvg',
 'abt events',
 'abt functions',
 'abt half',
 'abt leona',
 'abt making',
 'abt mei',
 'abt movie',
 'abt muz',
 'abt rite',
 'abt rows',
 'abt syd',
 'abt tat',
 'abt tel',
 'abt tht',
 '

In [39]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
tfidf.fit(bag)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [54]:
tfidf.transform(bag).toarray()[2,1:]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.44967417, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [55]:
dataset["text"]

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

## Topic modelling

In [153]:
count = CountVectorizer(stop_words="english", max_df=0.1, max_features=5000)
X = count.fit_transform(dataset["text"].values)

In [154]:
lda = LatentDirichletAllocation(n_topics=5, random_state=1, learning_method="batch")
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=5, perp_tol=0.1,
             random_state=1, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [155]:
X_topics = lda.transform(X)

In [157]:
n_top_words = 10
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d" % (topic_idx))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))

Topic 0
know love day good dont just great like send life
Topic 1
ok going home want yes don work just da doing
Topic 2
free txt ur stop reply mobile text send just www
Topic 3
gt lt ll sorry later ì_ lor like got come
Topic 4
ur come night time good just today meet got ll


In [158]:
dataset["topic"] = X_topics.argmax(axis=1)

In [159]:
def label_counter(label):
    def counter(values):
        return (values == label).sum()
    counter.__name__ = label
    return counter

In [160]:
topic_stats = dataset[["label", "topic"]]\
    .groupby(by="topic")\
    .agg(["count", label_counter("spam"), label_counter("ham")])\
    .rename(columns={"count": "N"})["label"]
topic_stats["spam_ratio"] = topic_stats["spam"]/topic_stats["N"]
topic_stats["ham_ratio"] = topic_stats["ham"]/topic_stats["N"]

In [161]:
topic_stats

Unnamed: 0_level_0,N,spam,ham,spam_ratio,ham_ratio
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1323,47,1276,0.035525,0.964475
1,1153,14,1139,0.012142,0.987858
2,911,500,411,0.548847,0.451153
3,1103,97,1006,0.087942,0.912058
4,1082,89,993,0.082255,0.917745
