In [1]:
# Importing the Important Libraries
import pandas as pd
import nltk

In [2]:
# Importing the Data Preprocessing Libraries
import re as regex
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Importing Model Building Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Downloading the nltk resources this notebook relies on:
#   - 'stopwords': the stop-word lists read while cleaning the messages
#   - 'wordnet'  : the lexical database WordNetLemmatizer looks words up in
# Without 'wordnet' the lemmatization step raises LookupError on a machine
# where the corpus has not been downloaded before.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Loading the tab-separated dataset file. Column names are supplied through
# `names`, so the first line of the file is read as data, not as a header.
column_names = ['Label', 'Message']
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=column_names,
)

In [6]:
# Previewing the first 10 rows of the loaded frame
df.head(n=10)

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## Data Preprocessing and Data Cleaning

In [7]:
# Counting missing values per column (a count of 0 means the column is complete)
df.isna().sum()

Label      0
Message    0
dtype: int64

In [8]:
# The null check above reported no missing values, so no rows need to be
# dropped or imputed and we can continue with the text preprocessing.
# Creating the WordNet lemmatizer that the cleaning loop applies to each word.
lemmatizer = WordNetLemmatizer()

In [9]:
# Building the cleaned corpus: one normalised string per message.
#
# The stop-word set is loaded ONCE, outside the loop. Note the lowercase
# 'english': the corpus file id is lowercase, and a capitalised id only
# resolves on case-insensitive filesystems.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter (digits, punctuation, symbols) becomes a space
non_letter_pattern = regex.compile('[^a-zA-Z]')

corpus = []

# Iterating over the column values directly avoids depending on the frame
# having a default 0..n-1 index.
for raw_message in df['Message']:
    # Strip extra characters and symbols, lowercase, and split on whitespace
    tokens = non_letter_pattern.sub(' ', raw_message).lower().split()
    # Drop stop words and lemmatize what remains
    kept_words = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(kept_words))

In [10]:
# Turning the cleaned messages into TF-IDF vectors, keeping at most 3000 terms
tfidf_ve = TfidfVectorizer(max_features=3000)

# fit_transform learns the vocabulary/IDF weights from `corpus` and returns the
# document-term matrix; toarray() converts it to a dense array for the feature set X
tfidf_matrix = tfidf_ve.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [11]:
# Label encoding scheme, keyed by column name: 'ham' -> 0, 'spam' -> 1
enc = {'Label': dict(ham=0, spam=1)}

In [12]:
# Encoding the Label column with the `enc` mapping and extracting the class vector.
# The mapping is applied to the Label column only (rather than running
# DataFrame.replace over the whole frame, which relies on implicit
# object -> int downcasting that newer pandas versions deprecate).
# astype('int64') guarantees an integer array and raises if some label has no
# entry in `enc`, instead of letting an unmapped value through silently.
y = df['Label'].map(enc['Label']).astype('int64').to_numpy()

In [13]:
# Dividing the Dataset into Training Set (70%) and Testing Set (30%).
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Model Building and Training

In [14]:
# Dividing the Dataset into Training Set (70%) and Testing Set (30%).
# NOTE: this repeats the split already performed in the preceding cell.
# random_state pins the shuffle so re-running it always yields the same split
# and the accuracy reported below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [15]:
# Training a multinomial Naive Bayes classifier on the training split
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predicting on the test split; the accuracy is the cell's last expression,
# so it is what the notebook displays as output
y_pred = spam_detect_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9772727272727273