In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

# The stop-word corpus has to be downloaded before `stopwords.words` can read it.
nltk.download('stopwords')

# English stop words, held in a set.
STOPWORDS = set(stopwords.words('english'))

## Loading the corpus ##

In [None]:
# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
column_names_list = ["label", "text"]

# NOTE(review): `input` shadows the Python built-in of the same name; the
# name is kept because the following cell copies this frame from it.
input = pd.read_csv(
    '../SMS_Spam_Collection/SMSSpamCollection',
    sep="\t",
    names=column_names_list,
)
input

### Comparing the length of texts in each class ###

In [None]:
# Work on a deep copy so the loaded frame is left untouched.
data = input.copy(deep=True)

# Number of characters in each message.
data["text length"] = data["text"].map(len)

In [None]:
# Summary statistics of message length for the ham class.
data.loc[data["label"] == "ham", "text length"].describe()

In [None]:
# Summary statistics of message length for the spam class, to compare with
# the ham summary in the previous cell (this cell used to repeat "ham").
data[data["label"] == "spam"]["text length"].describe()

In [None]:
# Side-by-side density histograms of message length, one panel per class.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8.1, 5))
ham_ax, spam_ax = axs

ham_lengths = data.loc[data["label"] == "ham", "text length"]
spam_lengths = data.loc[data["label"] == "spam", "text length"]

# Left panel: ham messages.
sns.histplot(ham_lengths, color="green", ax=ham_ax, stat="density")
ham_ax.set_xlabel('Message Length')
ham_ax.set_title('Ham Messages')
# One tick every 100 characters, from 0 up to 800.
ham_ax.set_xticks(np.arange(0, 801, 100))

# Right panel: spam messages, with the y-axis label blanked.
sns.histplot(spam_lengths, color="red", ax=spam_ax, stat="density")
spam_ax.set_xlabel('Message Length')
spam_ax.set_ylabel('')
spam_ax.set_title('Spam Messages')

In [None]:
# Density histograms of message length per class, sharing one y-axis so the
# two panels can be compared directly.
fig, axes = plt.subplots(1, 2, sharey='row')

panels = zip(axes, ("ham", "spam"), ('Ham', 'Spam'), ("blue", "red"))
for ax, label, title, color in panels:
    data.loc[data["label"] == label, "text length"].hist(
        bins=20, edgecolor='black', ax=ax, density=True, color=color
    )
    ax.set_xlabel('Message Length')
    ax.set_ylabel('density')
    ax.set_title(title)

fig.suptitle('Message lengths in each class', fontsize=16, y=1)

# `Figure.show()` only works with a GUI backend and merely emits a warning
# under the notebook's inline backend; `plt.show()` works in both.
plt.show()

## Analyzing word statistics ##

In [None]:
# Some cleaning: strip punctuation, symbols and digits, then lower-case.
analysis2 = data.copy(deep=True)

# `regex` is passed explicitly on every call because the default of
# `Series.str.replace` differs between pandas versions. The three passes
# keep the original removal order: first `& / > < "`, then doubled single
# quotes, then the remaining punctuation, `£` and digits.
analysis2["text"] = (
    analysis2["text"]
    .str.replace(r'[&/><"]', "", regex=True)
    .str.replace("''", "", regex=False)
    .str.replace(r"[!?.,:;*#£0-9]", "", regex=True)
    .str.lower()
)


In [None]:
def _flatten_tokens(token_lists):
    """Flatten per-message token lists into one Series of non-empty words.

    Each token is stripped of surrounding whitespace; tokens that are empty
    after stripping are dropped.
    """
    # A single flattening pass; repeated `lst = lst + x` copies the growing
    # list on every message and is quadratic in the number of words.
    flat = [token for tokens in token_lists for token in tokens]
    return pd.Series(flat).str.strip().replace('', np.nan).dropna()


# Per-message token lists for each class (split on single spaces).
spam_words = analysis2[analysis2["label"] == "spam"]["text"].str.split(" ")
ham_words = analysis2[analysis2["label"] == "ham"]["text"].str.split(" ")

# One Series per class containing every word in that class's messages.
ham_words_concatenated = _flatten_tokens(ham_words)
spam_words_concatenated = _flatten_tokens(spam_words)

In [None]:
# Corpus-specific filler words and tokens to ignore in addition to the
# NLTK English stop words.
EXTRA_UNWANTED_WORDS = [
    "one", "lor", "about", "can", "have", "-", "only", "just", "+", "p",
    "our", "now", "from", "going", "i'll", "ü", "he", "there", "do", "was",
    "its", "then", "how", "am", "with", "or", "ok", "no", "this", "what",
    "when", "we", "ur", "ltgt", "be", "if", "i'm", 'i', 'you', 'to', 'the',
    'a', 'u', 'and', 'in', 'me', 'my', 'is', 'it', 'of', 'for', 'that',
    'but', 'your', 'so', 'not', 'are', 'on', 'at',
]

# Seed from STOPWORDS instead of from `unwanted_words` itself: the name is
# not defined before this cell, so the self-reference raised NameError on a
# fresh kernel. Sorting gives a deterministic, duplicate-free list.
unwanted_words = sorted(set(STOPWORDS).union(EXTRA_UNWANTED_WORDS))

### Printing the most frequent words in each class ###

In [None]:
# Relative frequency of every spam word, most frequent first.
spam_word_freq = spam_words_concatenated.value_counts(normalize=True)

# Drop the unwanted words, then keep the 20 most frequent survivors.
spam_keep_mask = ~spam_word_freq.index.isin(unwanted_words)
top_spam_words = list(spam_word_freq[spam_keep_mask].head(20).index)

print("The 20 most frequent words in the spam messages after deleting pronouns, wh-questions etc.. are: ", top_spam_words)

In [None]:
# Relative frequency of every ham word, most frequent first.
ham_word_freq = ham_words_concatenated.value_counts(normalize=True)

# Drop the unwanted words, then keep the 20 most frequent survivors.
ham_keep_mask = ~ham_word_freq.index.isin(unwanted_words)
top_ham_words = list(ham_word_freq[ham_keep_mask].head(20).index)

print("The 20 most frequent words in the ham messages after deleting pronouns, wh-questions etc.. are: ", top_ham_words)

### Calculating the average word length in each class ###

In [None]:
# Mean word length over all ham words, then over ham words that are not in
# the unwanted list.
ham_mean_all = ham_words_concatenated.map(len).mean()
ham_content_words = ham_words_concatenated[~ham_words_concatenated.isin(unwanted_words)]
ham_mean_content = ham_content_words.map(len).mean()

print("the average word-length in the ham class is: ", round(ham_mean_all, 2))
print("The average word-length after deleting pronouns, wh-questions etc.. is: ", round(ham_mean_content, 2))

In [None]:
# Mean word length over all spam words, then over spam words that are not in
# the unwanted list.
spam_mean_all = spam_words_concatenated.map(len).mean()
spam_content_words = spam_words_concatenated[~spam_words_concatenated.isin(unwanted_words)]
spam_mean_content = spam_content_words.map(len).mean()

print("the average word-length in the spam class is: ", round(spam_mean_all, 2))
print("The average word-length after deleting pronouns, wh-questions etc.. is: ", round(spam_mean_content, 2))

### Number of unique words in each class ###

In [None]:
# Vocabulary size per class and for both classes combined.
n_unique_ham = len(ham_words_concatenated.unique())
n_unique_spam = len(spam_words_concatenated.unique())
all_words = pd.concat([spam_words_concatenated, ham_words_concatenated])
n_unique_total = len(all_words.unique())

print("The number of unique words in the ham class is: ", n_unique_ham)
print("The number of unique words in the spam class is: ", n_unique_spam)
print("The number of unique words in the whole dataset is: ", n_unique_total)

In [None]:
import multidict as multidict
import os
import re
from PIL import Image
from os import path
from wordcloud import WordCloud


def getFrequencyDictForText(sentence):
    """Count case-insensitive word frequencies in ``sentence``.

    The text is split on whitespace and each word is lower-cased. A small
    set of function words ("a", "the", "to", ...) is skipped.

    Args:
        sentence: Text to tokenise.

    Returns:
        A ``multidict.MultiDict`` mapping each remaining lower-cased word to
        its number of occurrences, in order of first appearance.
    """
    # Whole-word exclusions. The earlier `re.match("a|the|...")` only
    # anchored at the start of the word, so it also discarded every word
    # that merely begins with one of these (e.g. "all", "today", "order").
    ignored_words = frozenset((
        "a", "the", "an", "to", "in", "for", "of", "or",
        "by", "with", "is", "on", "that", "be",
    ))

    counts = {}
    # `split()` without arguments also drops the empty tokens that repeated
    # spaces would otherwise produce.
    for token in sentence.split():
        # Lower-case before both the lookup and the store so that "Free"
        # and "free" accumulate under the same key.
        word = token.lower()
        if word in ignored_words:
            continue
        counts[word] = counts.get(word, 0) + 1

    fullTermsDict = multidict.MultiDict()
    for word, count in counts.items():
        fullTermsDict.add(word, count)
    return fullTermsDict


def makeImage(text):
    """Draw and display a word cloud.

    Args:
        text: Word frequencies, handed unchanged to
            ``WordCloud.generate_from_frequencies``.
    """
    cloud = WordCloud(
        width=1600,
        height=800,
        background_color="white",
        max_words=1000,
    )
    cloud.generate_from_frequencies(text)

    # Show the rendered cloud on its own figure, without axes.
    _, ax = plt.subplots(figsize=(9, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    plt.show()

In [None]:
# Word cloud for the ham class: drop the unwanted words, join the rest into
# one space-separated string, count frequencies and draw.
ham_cloud_words = ham_words_concatenated[~ham_words_concatenated.isin(unwanted_words)]
ham_cloud_text = ham_cloud_words.str.cat(sep=" ")
makeImage(getFrequencyDictForText(ham_cloud_text))

In [None]:
# Word cloud for the spam class: drop the unwanted words, join the rest into
# one space-separated string, count frequencies and draw.
spam_cloud_words = spam_words_concatenated[~spam_words_concatenated.isin(unwanted_words)]
spam_cloud_text = spam_cloud_words.str.cat(sep=" ")
makeImage(getFrequencyDictForText(spam_cloud_text))