# Spam Classification Using NLP 

In [None]:
# Importing the Libraries

# Data Loading 
import pandas as pd

#Text Cleaning 
import re 

# Text Preprocessing 
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Vectorizers 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Train - test split 
from sklearn.model_selection import train_test_split

# Model building
from lazypredict.Supervised import LazyClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

#Visualization 
import seaborn as sns 


In [None]:
# Load the SMS dataset from a tab-separated file.
# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
data = pd.read_csv(
    'Data/SMSSpamCollection',
    sep='\t',
    names=['labels', 'message'],
)
# Preview the first rows of the frame
data.head()

The labels are `spam` and `ham` (not spam).

In [None]:
# Class balance: bar chart of the number of messages per label
# (the trailing `;` suppresses the Axes repr in the cell output)
sns.countplot(data=data, x='labels');
# Exact counts per label, shown as the cell's output
data['labels'].value_counts()

## Stemmer + Bag of Words Combination

Next Step - 
- Remove all characters that aren't letters, such as punctuation marks, numbers, etc.
- Convert the text to lower case
- Create word tokens, i.e. split the messages into words
- Remove all stopwords using the `stopwords` corpus in nltk
- Carry out stemming on these words
- Join the stemmed words back together to create sentences
- Add these sentences to the corpus.

Note - the corpus is a list containing all the processed sentences in the dataset and will be used later to create the feature vectors (Bag of Words in this section, TF-IDF in the next one)

In [None]:
# Text cleaning and preprocessing (stemming)
#
# For every message: replace each non-letter character with a space,
# lower-case the text, split it into word tokens, drop English stopwords,
# stem the remaining tokens and join them back into a single string that is
# appended to `corpus`.

stemmer = PorterStemmer()

# Load the stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token, and membership tests on a list are linear.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in data['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stemmed_tokens))


In [None]:
# Creating dummies for the variable - label with 1 for spam and 0 for ham.
# The 'spam' indicator column is selected by name rather than by position, so
# the encoding does not depend on how get_dummies orders its columns, and
# dtype=int gives 0/1 integers regardless of the pandas version's default.
y = pd.get_dummies(data['labels'], dtype=int)['spam'].values


# Splitting my data into train and test before creating Bag of Words, so the
# vocabulary can be learned from the training messages only.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.2, random_state = 0)

In [None]:
# Creating Bag of Words
# NOTE: X_train / X_test are overwritten with dense arrays below, so re-run the
# train/test split cell before re-running this one.
vectorizer = CountVectorizer(max_features=2500)
# The vocabulary (at most 2500 terms) is learned from the training messages only.
X_train = vectorizer.fit_transform(X_train).toarray()
# Re-use the fitted vocabulary for the test set. Calling fit_transform here
# would learn a second, different vocabulary, so the test columns would no
# longer correspond to the features the model was trained on.
X_test = vectorizer.transform(X_test).toarray()

# Model building
clf = GaussianNB().fit(X_train,y_train)
y_predicted = clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred): with this order the rows of the
# confusion matrix are the true labels and the columns the predictions.
confusion_mat = confusion_matrix(y_test, y_predicted)
accuracy = accuracy_score(y_test, y_predicted)

print(f'Confusion Matrix \n {confusion_mat}')
print(f'Accuracy Score is = {accuracy}')

In [None]:
# Fit a LazyClassifier on the vectorized train/test split.
# NOTE: this rebinds `clf` to the LazyClassifier instance.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
# fit() returns two objects, unpacked here as `models` and `predictions`
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

In [None]:
# Display `models`, the first object returned by the LazyClassifier fit call
models

## Lemmatization + TFIDF Vectorizer 

In [None]:
# Text cleaning and preprocessing (lemmatization)
#
# For every message: replace each non-letter character with a space,
# lower-case the text, split it into word tokens, drop English stopwords,
# lemmatize the remaining tokens and join them back into a single string that
# is appended to `corpus`.

lemmatizer = WordNetLemmatizer()

# Load the stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token, and membership tests on a list are linear.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in data['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(lemmatized_tokens))


In [None]:
# Creating dummies for the variable - label with 1 for spam and 0 for ham.
# The 'spam' indicator column is selected by name rather than by position, so
# the encoding does not depend on how get_dummies orders its columns, and
# dtype=int gives 0/1 integers regardless of the pandas version's default.
y = pd.get_dummies(data['labels'], dtype=int)['spam'].values


# Splitting my data into train and test before creating TFIDF, so the
# vocabulary and IDF weights can be learned from the training messages only.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.2, random_state = 0)

In [None]:
# Creating TF-IDF features
# NOTE: X_train / X_test are overwritten with dense arrays below, so re-run the
# train/test split cell before re-running this one.
vectorizer = TfidfVectorizer()
# Fit on the training messages only, then apply the same fitted vectorizer
# to the test messages so both share one feature space.
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

# Model building
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore the reported metrics, are reproducible across runs.
clf = RandomForestClassifier(random_state=0).fit(X_train,y_train)
y_predicted = clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred): with this order the rows of the
# confusion matrix are the true labels and the columns the predictions.
confusion_mat = confusion_matrix(y_test, y_predicted)
accuracy = accuracy_score(y_test, y_predicted)

print(f'Confusion Matrix \n {confusion_mat}')
print(f'Accuracy Score is = {accuracy}')

In [None]:
# Fit a LazyClassifier on the TF-IDF train/test split.
# NOTE: this rebinds `clf` to the LazyClassifier instance.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
# fit() returns two objects, unpacked here as `models` and `predictions`
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

In [None]:
# Display `models`, the first object returned by the LazyClassifier fit call
models