In [1]:
'''!pip install nltk
nltk.download('stopwords')
nltk.download('wordnet')'''

"!pip install nltk\nnltk.download('stopwords')\nnltk.download('wordnet')"

In [2]:
import nltk
import pandas as pd
import scipy.stats as stats
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_table("SMSCollection.csv",header = None, sep = ',')
data.rename(columns={0: 'spam', 1: 'text'}, inplace=True)
data = data.iloc[1:]
data.head()

Unnamed: 0,spam,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data['spam'].value_counts()

ham     4825
spam     752
Name: spam, dtype: int64

#### Data preprocessing

In [5]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

In [6]:
def tokenize(text):
    tokens = re.split('\W+', text.lower())
    return tokens

In [7]:
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

In [8]:
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [9]:
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [10]:
data['cleaned'] = data['text'].apply(remove_punct)
data['tokenized'] = data['cleaned'].apply(tokenize) 
data['nostop'] = data['tokenized'].apply(remove_stopwords)
data['stemmed'] = data['nostop'].apply(stemming)
data['lemmatized'] = data['nostop'].apply(lemmatizing)

data.head()

Unnamed: 0,spam,text,cleaned,tokenized,nostop,stemmed,lemmatized
1,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n..."
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
4,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"


In [11]:
data['spam'] = np.where(data['spam'] == 'ham', 0, 1) #where 0 is ham and 1 is spam
data.head(20)

Unnamed: 0,spam,text,cleaned,tokenized,nostop,stemmed,lemmatized
1,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[go, jurong, point, crazy, available, bugis, n..."
2,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
3,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
4,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
5,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"
6,1,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling its been 3 weeks now...,"[freemsg, hey, there, darling, its, been, 3, w...","[freemsg, hey, darling, 3, weeks, word, back, ...","[freemsg, hey, darl, 3, week, word, back, id, ...","[freemsg, hey, darling, 3, week, word, back, i..."
7,0,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[even, brother, like, speak, treat, like, aid,..."
8,0,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnaminun...,"[as, per, your, request, melle, melle, oru, mi...","[per, request, melle, melle, oru, minnaminungi...","[per, request, mell, mell, oru, minnaminungint...","[per, request, melle, melle, oru, minnaminungi..."
9,1,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have b...,"[winner, as, a, valued, network, customer, you...","[winner, valued, network, customer, selected, ...","[winner, valu, network, custom, select, receiv...","[winner, valued, network, customer, selected, ..."
10,1,Had your mobile 11 months or more? U R entitle...,Had your mobile 11 months or more U R entitled...,"[had, your, mobile, 11, months, or, more, u, r...","[mobile, 11, months, u, r, entitled, update, l...","[mobil, 11, month, u, r, entitl, updat, latest...","[mobile, 11, month, u, r, entitled, update, la..."


In [12]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('spam', axis=1), data['spam'], test_size=0.3, random_state=0)

In [14]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
vectorizer = tfidf_vect.fit(X_train['text'])

tfidf_train = vectorizer.transform(X_train['text'])
X_train_vect =  pd.DataFrame(tfidf_train.toarray(), columns = vectorizer.get_feature_names())

tfidf_test = vectorizer.transform(X_test['text'])
X_test_vect =  pd.DataFrame(tfidf_test.toarray(), columns = vectorizer.get_feature_names())

X_train_vect.head()

Unnamed: 0,Unnamed: 1,0,008704050406,01223585236,01223585334,02,020603,0207,02070836089,02072069400,...,zed,zero,zhong,zindgi,zoe,zouk,zyada,ü,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.139735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Training the model**

In [15]:
modal_class_label = stats.mode(y_train)[0]
y_train_preds_naive = [int(modal_class_label) for row in X_train.iloc[:,0]]
y_test_preds_naive = [int(modal_class_label) for row in X_test.iloc[:,0]]

# train accuracy score
y_true = y_train
y_pred = y_train_preds_naive

train_accuracy_naive = accuracy_score(y_true, y_pred) * 100
print(f"The Naive Baseline Model's accuracy on train data is {train_accuracy_naive:.2f}%.")

# test accuracy score
y_true = y_test
y_pred = y_test_preds_naive

test_accuracy_naive = accuracy_score(y_true, y_pred) * 100
print(f"The Naive Baseline Model's accuracy on test data is {test_accuracy_naive:.2f}%.")

The Naive Baseline Model's accuracy on train data is 86.11%.
The Naive Baseline Model's accuracy on test data is 87.46%.


In [16]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)
rf_model = rf.fit(X_train_vect, y_train)

#test accuracy score
y_true = y_train
y_pred = rf_model.predict(X_train_vect)

train_accuracy_ranfor = accuracy_score(y_true, y_pred) * 100
print(f"The RandomForestClassifier's accuracy on train data is {train_accuracy_ranfor:.2f}%.")

#test accuracy score
y_true = y_test
y_pred = rf_model.predict(X_test_vect)

test_accuracy_ranfor = accuracy_score(y_true, y_pred) * 100
print(f"The RandomForestClassifier's accuracy on test data is {test_accuracy_ranfor:.2f}%.")

The RandomForestClassifier's accuracy on train data is 100.00%.
The RandomForestClassifier's accuracy on test data is 97.73%.


In [17]:
#Logistic Regression Prediction Model
from sklearn.linear_model import LogisticRegression

#logreg = LogisticRegression(solver='liblinear')
#logreg.fit(X_train_vect, y_train)
#y_train_preds_logreg = logreg.predict(X_train_vect)
#y_test_preds_logreg = logreg.predict(X_test_vect)

# train accuracy score
#y_true = y_train
#y_pred = y_train_preds_logreg

#train_accuracy_logreg = accuracy_score(y_true, y_pred) * 100
#print(f"The LogReg Model's accuracy on train data is {train_accuracy_logreg:.2f}%.")

# test accuracy score
#y_true = y_test
#y_pred = y_test_preds_logreg

#test_accuracy_logreg = accuracy_score(y_true, y_pred) * 100
#print(f"The LogReg Model's accuracy on test data is {test_accuracy_logreg:.2f}%.")

In [18]:
from sklearn.tree import DecisionTreeClassifier

#dtree = DecisionTreeClassifier(random_state=0)
#dtree.fit(X_train_vect, y_train)
#y_train_preds_dtree = dtree.predict(X_train_vect)
#y_test_preds_dtree = dtree.predict(X_test_vect)

# train accuracy score
#y_true = y_train
#y_pred = y_train_preds_dtree

#train_accuracy_dtree = accuracy_score(y_true, y_pred) * 100
#print(f"The DTree Model's accuracy on train data is {train_accuracy_dtree:.2f}%.")

# test accuracy score
#y_true = y_test
#y_pred = y_test_preds_dtree

#test_accuracy_dtree = accuracy_score(y_true, y_pred) * 100
#print(f"The DTRee Model's accuracy on test data is {test_accuracy_dtree:.2f}%.")

#### Example of different predictions

In [19]:
def predict(text): #where 0 is ham and 1 is spam
    test = [[text]]
    df = pd.DataFrame(test, columns = ['text'])
    df_test = vectorizer.transform(df['text'])
    array_test = pd.DataFrame(df_test.toarray())
    y_pred = rf_model.predict(array_test)
    
    if(y_pred[0] == 1): #where 0 is ham and 1 is spam
        print("Message Type: " + "spam" )
    
    elif(y_pred[0] == 0): #where 0 is ham and 1 is spam
        print("Message Type: " + "ham" )

In [20]:
predict("hello hello u wan come over to my hse tmr?")

Message Type: ham


In [21]:
predict("free entry chance win urgent next week")

Message Type: spam


In [22]:
predict("Dear client, your Trading Account has been successfully opened. Please visit www.tigerbrokers.com.sg, select My Account and complete Deposit Notification.")

Message Type: ham


In [23]:
predict("Top Up 100% bonus $50 free $50 DOUBLE BONUS FOR NEW MEMBERS UP TO 300 SGD FOR CASING SP0RT,H0URSE,l0TTERY Register Now Free Spin www.ads4u.asia/l/3l4p")

Message Type: spam


In [24]:
predict("Hey bro/sis, looking for betting account? -Soccer -Casion&Slots -HorseRace 0 deposit! 10-30% bonus/rebate!")

Message Type: ham


In [25]:
predict("WoW Power lah Credit Acc -NO NEED DEPOSIT -REBATE 10% Cash Acc -Welcome Bonus 50% >LiveC@sino >Sl0t > SPORT wa.me/+6581524963 GOOD AND FAST SERVICE")

Message Type: spam


In [26]:
predict("Afternoon I'm Abby from SG. Do you like traveling? I got some good opportunity to get some passive income jb introduce to you. If you are interested just simplty reply me yes and i will send you more detail.")

Message Type: spam


In [27]:
predict("Hello do you free and easy passive income? I have a job for you with very little work, reply to me for more detail")

Message Type: spam


#### Example of deleting telegram chat upon scam message detection with use of telethon

In [28]:
from telethon.tl.functions.messages import DeleteChatUserRequest

def deleteChat(text):
    if(predict(text)==1):
        with TelegramClient('name', api_id, api_hash) as client:
            client(functions.messages.DeleteChatUserRequest(
            chat_id=chat_id,
            user_id='me'))

            client.run_until_disconnected()