In [20]:
import csv
import string
import pandas as pd
from nltk.tokenize import word_tokenize

# Let displayed frames show up to 250 characters per cell so SMS bodies are readable.
pd.set_option("display.max_colwidth", 250)

# The corpus file is read as tab-separated with no header row; name the two columns.
dataset = pd.read_csv("SMSSpamCollection", header=None, sep="\t")
dataset.columns = ["label", "sms_msgs"]

# Class balance: row count of the first label group divided by that of the
# second (groups come back in sorted label order).
col1 = dataset.groupby("label")
per_label_counts = col1.count().values
ratio = per_label_counts[0] / per_label_counts[1]
# cleaning 
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, ...) are kept in their
    original order.
    """
    return "".join(ch for ch in text if ch not in string.punctuation)

In [21]:
from nltk.corpus import stopwords

# Stage 1: strip punctuation from the raw message text.
dataset["clean_msg"] = dataset["sms_msgs"].apply(remove_punctuation)
# Stage 2: lowercase the cleaned text and split it into word tokens.
dataset["tokenized_msg"] = dataset["clean_msg"].apply(
    lambda cleaned: word_tokenize(cleaned.lower())
)

# removing stopwords
def remove_sw(msg):
    """Drop English stopwords from a tokenized message.

    Parameters
    ----------
    msg : iterable of str
        Tokens of one message, in order.

    Returns
    -------
    list of str
        The tokens of ``msg`` that are not in NLTK's English stopword list,
        with their original order preserved.

    Notes
    -----
    Matching is an exact, case-sensitive comparison against the stopword
    list, so callers should lowercase tokens first if they want
    capitalised stopwords removed as well.
    """
    # Fetch the stopword list once per call instead of once per token, and
    # keep it in a set so each membership test is O(1) rather than a list scan.
    english_sw = set(stopwords.words("english"))
    return [token for token in msg if token not in english_sw]

# Stage 3: filter stopwords out of every token list.
dataset["without_sw"] = dataset["tokenized_msg"].apply(remove_sw)

In [22]:
from nltk.stem import PorterStemmer

# Module-level Porter stemmer instance; stem_msg calls ps.stem on every token.
ps = PorterStemmer()

def stem_msg(msg):
    """Return a new list holding the stem of every token in ``msg``.

    Each token is passed through the module-level stemmer ``ps``; order and
    length of the token sequence are preserved.
    """
    return [ps.stem(token) for token in msg]

# Stage 4a: stem the stopword-free tokens.
dataset["stemmed_msg"] = dataset["without_sw"].apply(stem_msg)

In [23]:
from nltk import WordNetLemmatizer

# Module-level WordNet lemmatizer instance; lemmatize_msg calls it on every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_msg(sent=None):
    """Return a new list holding the lemma of every token in ``sent``.

    ``sent`` defaults to ``None``, which is treated as an empty token list.
    Each token is passed through the module-level ``lemmatizer``; order and
    length of the token sequence are preserved.
    """
    tokens = sent if sent is not None else []
    return [lemmatizer.lemmatize(tok) for tok in tokens]

# Stage 4b: lemmatize the stopword-free tokens (alternative to stemming).
dataset["lemmatized_msg"] = dataset["without_sw"].apply(lemmatize_msg)

In [24]:
# pre-processing 
def text_clean(msg):
    """Turn one raw message string into a list of normalised tokens.

    The steps are: strip punctuation, lowercase, tokenize, drop English
    stopwords, then stem. The function takes a single string and returns a
    list of tokens, which is the shape expected of a callable passed as
    ``analyzer`` to ``TfidfVectorizer``.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    list of str
        Stemmed, stopword-free tokens in their original order.
    """
    msg = remove_punctuation(msg)
    # Lowercase before tokenizing, as the "tokenized_msg" column does:
    # remove_sw compares tokens exactly, so without this capitalised
    # stopwords ("The", "I", ...) would slip through into the vocabulary.
    msg = word_tokenize(msg.lower())
    msg = remove_sw(msg)
    msg = stem_msg(msg)
    # To lemmatize instead of stem, replace the line above with:
    #   msg = lemmatize_msg(msg)
    return msg

In [25]:
# vectorization 
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the raw messages; text_clean acts as the analyzer, so it
# receives each raw string and returns that message's tokens.
tfidf = TfidfVectorizer(analyzer=text_clean)
# Alternative (needs the CountVectorizer import): trigram counts via
#   cv = CountVectorizer(ngram_range=(3, 3))
x = tfidf.fit_transform(dataset.sms_msgs)
# Densify the matrix returned by fit_transform into a DataFrame with default integer labels.
dataFrame = pd.DataFrame(data=x.toarray())

In [26]:
# Feature engineering: added message length and punctuation %
# Message length in characters (punctuation and whitespace included).
dataset["msg_len"] = dataset["sms_msgs"].apply(len)

def count_punc(msg):
    """Return the percentage of characters in ``msg`` that are punctuation.

    Parameters
    ----------
    msg : str
        Message text.

    Returns
    -------
    float
        ``100 * (punctuation characters) / len(msg)``, where a punctuation
        character is any member of ``string.punctuation``. An empty message
        yields ``0.0``.
    """
    if len(msg) == 0:
        # Nothing to measure; avoid dividing by zero on an empty message.
        return 0.0
    punct_total = sum(1 for ch in msg if ch in string.punctuation)
    return (punct_total / len(msg)) * 100

dataset["punct_perc"] = dataset["sms_msgs"].apply(count_punc)
# Reorder columns: label and raw text, then the engineered features, then
# each preprocessing stage in pipeline order.
dataset = dataset[
    [
        "label",
        "sms_msgs",
        "msg_len",
        "punct_perc",
        "clean_msg",
        "tokenized_msg",
        "without_sw",
        "stemmed_msg",
        "lemmatized_msg",
    ]
]

In [27]:
# training and prediction
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Features are the TF-IDF frame; targets are the label column as an array.
X = dataFrame
y = np.array(dataset["label"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)
x_train, y_train = np.array(x_train), np.array(y_train)

# Fit a logistic regression with default settings and predict the held-out rows.
lr = LogisticRegression()
lr_model = lr.fit(x_train, y_train)
results = lr_model.predict(x_test)  # predicted values

In [None]:
def get_score(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    ``model`` must provide ``fit(X, y)`` and ``score(X, y)``. The model is
    fitted in place, and whatever ``score`` returns is passed back unchanged.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [None]:
# kfold data split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# K-fold cross-validation of three classifiers on the TF-IDF features.
k = 2
kfold = KFold(k, random_state=1337, shuffle=True)
lr_scores, rfc_scores, svm_scores = list(), list(), list()

# Labels as an array aligned row-for-row with dataFrame: both are derived
# from the rows of `dataset` in the same order.
fold_labels = dataset["label"].to_numpy()

for train_idx, test_idx in kfold.split(dataFrame):
    # KFold yields positional row indices. Plain dataFrame[idx] would select
    # *columns* by label, so rows have to be taken with .iloc; the targets
    # come from the label array, not from the feature frame.
    train_x, test_x = dataFrame.iloc[train_idx], dataFrame.iloc[test_idx]
    train_y, test_y = fold_labels[train_idx], fold_labels[test_idx]

    # Score each model on THIS fold's split (not on the earlier global
    # train_test_split variables), using a fresh estimator per fold so no
    # fitted state is carried over from another cell or fold.
    lr_scores.append(get_score(LogisticRegression(), train_x, test_x, train_y, test_y))
    rfc_scores.append(
        get_score(
            # Seeded so the forest's score is reproducible across runs.
            RandomForestClassifier(n_estimators=20, random_state=1337),
            train_x, test_x, train_y, test_y,
        )
    )
    svm_scores.append(get_score(SVC(), train_x, test_x, train_y, test_y))