In [69]:
import numpy as np
import pandas as pd

In [70]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [71]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [76]:
df['v2'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

<h1>Steps to build the model</h1>

- Data Cleaning
- Exploratory Data Analysis
- Text Pre - Processing
- Model Building
- Evaluation
- Improvements
- Website
- Website Deployment

<h1>1.Data Cleaning</h1>

In [None]:
df.info()

In [5]:
# Drop the last 3 columns, as most of their values are null.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone a
# plain drop would raise KeyError. Re-binding `df` avoids in-place mutation.
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

In [6]:
# give the two remaining columns descriptive names: v1 -> target (label), v2 -> text (message)
df.rename(columns=dict(v1='target', v2='text'), inplace=True)

In [None]:
df.head()

In [8]:
# We use LabelEncoder to convert categorical labels to numeric values (if the labels all belong to a certain category, say colours, then the labels would be 'red', 'green', 'blue' etc.)
# Each distinct label is mapped to one integer code from 0 to n_classes-1, assigned in sorted label order; the code does NOT depend on how often the label occurs
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [9]:
# will assign ham = 0 and spam = 1
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# duplicate values
df.duplicated().sum()

In [13]:
# drop all duplicate values and keeps only the first occurrence of the value
df = df.drop_duplicates(keep='first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

In [None]:
df.head()

<h1>2. Exploratory Data Analysis (EDA)</h1>

In [None]:
# checking percentage of spam and ham messages
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Pie chart of the class balance.
# The slice labels are derived from the encoded class ids in the value_counts()
# index instead of being hard-coded, so a label can never end up on the wrong
# slice if the ordering of the counts changes.
class_names = {0: 'ham', 1: 'spam'}
class_counts = df['target'].value_counts()
slice_labels = [class_names.get(class_id, class_id) for class_id in class_counts.index]
plt.pie(class_counts, labels=slice_labels, autopct='%0.2f') # autopct prints each share as a percentage; '%0.2f' means 2 decimal places
plt.show() # renders the figure (and keeps the returned matplotlib objects from being echoed as cell output)

- data is imbalanced, as there are far more ham messages than spam messages

In [19]:
# new column: length of every message in characters
df['num_characters'] = df['text'].map(len)

In [20]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('punkt_tab')

In [23]:
# new column: tokenize every SMS with NLTK, then count the tokens it produced
df['num_words'] = df['text'].apply(nltk.word_tokenize).apply(len)

In [24]:
# new column: split every SMS into sentences with NLTK, then count them
df['num_sentences'] = df['text'].apply(nltk.sent_tokenize).apply(len)

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# fetching the stats for ham messages
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# fetching the stats for spam messages
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

- we notice that spam messages are generally longer than ham messages

In [None]:
df.head()

In [30]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of ham (green) and spam (red) to
# compare message lengths. Each histogram carries a label and a legend is drawn,
# so the figure can be read without looking at the code.
plt.figure(figsize=(12,8))
sns.histplot(df[df['target'] == 0]['num_characters'], color='green', label='ham')
sns.histplot(df[df['target'] == 1]['num_characters'], color='red', label='spam')
plt.legend()
plt.show()

- shows that most ham messages contain relatively few characters

- shows that most spam messages contain more characters (their counts are lower only because there are far fewer spam messages than ham messages)

In [None]:
# Overlay the word-count distributions of ham (green) and spam (red) to compare
# message lengths. Ham uses the same green as the character-count plot, and the
# labels plus legend make the two classes distinguishable in the figure itself.
plt.figure(figsize=(12,8))
sns.histplot(df[df['target'] == 0]['num_words'], color='green', label='ham')
sns.histplot(df[df['target'] == 1]['num_words'], color='red', label='spam')
plt.legend()
plt.show()

- shows that most ham messages contain relatively few words

- shows that most spam messages contain more words (their counts are lower only because there are far fewer spam messages than ham messages)

In [None]:
sns.pairplot(df, hue='target')

- we notice there are a few outliers (data points that lie far away from the rest) in the dataset
- we need to resolve this

**The three length columns are strongly correlated with each other, and keeping all of them would only add complexity to the model. We therefore keep a single one, num_characters, as it is the option with the least correlation to the other columns.**

<h1>Data Preprocessing</h1>

- Lower case
- Tokenization
- Removing special characters
- Removing stop words and punctuation
- stemming

Hence we use a single function which will do all these steps at once

In [34]:
from nltk.corpus import stopwords

In [None]:
import nltk
nltk.download('stopwords')

In [36]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer() # function to perform word stemming

In [37]:
import string # string package contains function string.punctuation which is used to remove all punctuation in our model

In [38]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def transform_text(text):
    """Normalise one raw message into a string of stemmed tokens.

    Steps, in order: lowercase, tokenize with NLTK, keep only alphanumeric
    tokens, remove English stop words, apply Porter stemming.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    # Look the stop words up in a cached set: calling stopwords.words('english')
    # for every token reloads the list and scans it linearly each time.
    stop_words = _english_stop_words()

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation test is needed after this filter.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)

In [None]:
transform_text(df['text'][0]) # does data preprocessing on the first row of the dataset

In [40]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [42]:
from wordcloud import WordCloud
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

In [43]:
# will show us what words occur most often in spam messages
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=' '))

In [None]:
plt.figure(figsize=(12,6))
plt.imshow(spam_wc)

In [45]:
# will show us what words occur most often in ham messages
# NOTE(review): this reuses the same `wc` object that produced spam_wc. WordCloud.generate()
# appears to return that object itself, in which case ham_wc and spam_wc are one and the same
# cloud and spam_wc shows the ham words from here on — confirm, and if so build a separate
# WordCloud instance per class.
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=' '))

In [None]:
# render the ham word cloud generated in the previous cell (ham_wc, not spam_wc)
plt.figure(figsize=(12,6))
plt.imshow(ham_wc)

- now we want to take the 30 most common words that occur in spam messages

In [47]:
# flatten the transformed spam messages into one list holding every token of every message
spam_corpus = [
    token
    for message in df[df['target'] == 1]['transformed_text'].tolist()
    for token in message.split()
]

In [None]:
len(spam_corpus)

In [None]:
# we create a dictionary of frequency of each transformed word occurring in a spam message
from collections import Counter
Counter(spam_corpus)

In [None]:
# here we take the 30 words with most occurrence
Counter(spam_corpus).most_common(30)

In [None]:
# we convert these 30 words into a dataframe in order to process it like data
pd.DataFrame(Counter(spam_corpus).most_common(30))

In [None]:
# Bar chart of the 30 most frequent transformed words in spam messages.
spam_counter = Counter(spam_corpus).most_common(30)
df_spam = pd.DataFrame(spam_counter, columns=[0, 1])  # column 0 = word, column 1 = number of occurrences
ax = sns.barplot(x=0, y=1, data=df_spam)
# The integer column names would otherwise be drawn as the axis labels "0" and "1".
ax.set(xlabel='word', ylabel='count', title='30 most common words in spam messages')
plt.xticks(rotation = 'vertical')
plt.show()

- repeating same process for ham words

In [53]:
# flatten the transformed ham messages into one list holding every token of every message
ham_corpus = [
    token
    for message in df[df['target'] == 0]['transformed_text'].tolist()
    for token in message.split()
]

In [None]:
# we create a dictionary of frequency of each transformed word occurring in a ham message
from collections import Counter
Counter(ham_corpus)

In [None]:
# Bar chart of the 30 most frequent transformed words in ham messages.
ham_counter = Counter(ham_corpus).most_common(30)
df_ham = pd.DataFrame(ham_counter, columns=[0, 1])  # column 0 = word, column 1 = number of occurrences
ax = sns.barplot(x=0, y=1, data=df_ham)
# The integer column names would otherwise be drawn as the axis labels "0" and "1".
ax.set(xlabel='word', ylabel='count', title='30 most common words in ham messages')
plt.xticks(rotation = 'vertical')
plt.show()

<h1>4. Model Building / Model improvement</h1>

We use the Naive Bayes algorithm, as it generally performs well on textual data

- from this we have found that **multinomial naive bayes** gives us the best results while using **tfidfVectorizer**

In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# we will use tfidf as that gives us a better precision score
cv = CountVectorizer() # ignore CountVectorizer
tfidf = TfidfVectorizer(max_features=3000) # takes 3000 of the most used words in the transformed text and creates model based on that

In [57]:
# using tfidf to transform the text to a vector array (text to numeric values)
x = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
x.shape

In [59]:
y = df['target'].values

In [None]:
y

In [61]:
from sklearn.model_selection import train_test_split
# Split the transformed text first and fit TF-IDF on the training part only.
# Fitting the vectorizer on the full dataset before splitting lets the held-out
# messages influence the vocabulary and IDF weights, which leaks test data into
# training and makes the evaluation scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['transformed_text'], y, test_size=0.2, random_state=2)
x_train = tfidf.fit_transform(text_train).toarray()  # learn vocabulary/IDF from the training messages
x_test = tfidf.transform(text_test).toarray()        # reuse them unchanged for the held-out messages

In [62]:
# learning algorithms for naive bayes and importing the packages to calculate accuracy score, confusion matrix and precision score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian naive Bayes: train on the training split, then score the held-out split
y_pred1 = gnb.fit(x_train, y_train).predict(x_test)
print('accuracy score = ', accuracy_score(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))
print('precision score = ', precision_score(y_test, y_pred1))

In [None]:
# multinomial naive Bayes: train on the training split, then score the held-out split
y_pred2 = mnb.fit(x_train, y_train).predict(x_test)
print('accuracy score = ', accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print('precision score = ', precision_score(y_test, y_pred2))

In [None]:
# Bernoulli naive Bayes: train on the training split, then score the held-out split
y_pred3 = bnb.fit(x_train, y_train).predict(x_test)
print('accuracy score = ', accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print('precision score = ', precision_score(y_test, y_pred3))

**Here we choose multinomial naive Bayes because its precision score is the best among all three models**

**Last Step**

In [66]:
# using pickle to export (serialize) the fitted vectorizer and model to files
import pickle

In [67]:
# Save the fitted vectorizer and the multinomial naive Bayes model to disk.
# The `with` blocks close each file as soon as its dump finishes, so the data is
# fully flushed and no file handle is left open in the kernel.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)