<a href="https://colab.research.google.com/github/bryancrigger/IST664---Natural-Language-Processing/blob/main/NLP_Project_Detecting_SPAM_Emails_with_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading in all the libraries

In [None]:
# Import libraries

# pip install graphviz
# pip install pydotplus

import numpy as np
import pandas as pd
import sklearn
import nltk
import re
import os
import random as rd
import graphviz
import matplotlib.pyplot as plt
import pydotplus


from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.tree import export_graphviz  
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline

from IPython.display import Image

# Text Cleaning & Processing

In [5]:
# Load the labelled messages from spam.csv (expected in the working directory)
# and show the resulting frame as the cell output.
data = pd.read_csv('spam.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Encode the target as an integer flag: 1 where Category is "spam",
# 0 for everything else (i.e. the not-spam / "ham" rows).
data['Spam'] = data['Category'].eq('spam').astype('int64')
data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Splitting the testing and training data (80% train / 20% test).
# random_state pins the shuffle so the split -- and every prediction and score
# derived from it -- is the same after Restart Kernel -> Run All. Outputs saved
# from an earlier, unseeded run may differ slightly from a fresh run.
X_train,X_test,y_train,y_test=train_test_split(data.Message,data.Spam,test_size=0.2,random_state=42)

# Feature Engineering

In [8]:
# Bag-of-words pipeline: CountVectorizer converts each message into a matrix of
# token counts, which is then fed to a multinomial Naive Bayes classifier.
model1=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())])

# Training the Model

In [9]:
# Fit the vectorizer and the classifier on the training messages and their 0/1 Spam labels.
model1.fit(X_train,y_train)

In [10]:
# Hand-written sample messages used to spot-check the trained pipeline.
emails=[
    'Sounds great! Are you home now?',
    'You have apps approved and ready to use with Zoom',
    'Career Services - Employment Opportunity with 2U',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES']

In [11]:
# Predict a label for each sample message: 1 = spam, 0 = not spam ("ham"),
# matching the encoding of the Spam column.
model1.predict(emails)

array([0, 0, 1, 1])

In [12]:
# Show the predicted label next to the text for the first two sample messages.
# The whole batch is scored once and the result indexed, instead of re-running
# the pipeline for every printed line.
predictions = model1.predict(emails)
for idx in range(2):
    print(predictions[idx], emails[idx], "\n")

0 Sounds great! Are you home now? 

0 You have apps approved and ready to use with Zoom 



In [13]:
# Evaluate the fitted pipeline on the held-out test split
# (for a classifier pipeline, score() reports accuracy).
model1.score(X_test,y_test)

0.9856502242152466

# Other Experiments

## Multinomial Naive Bayes

In [None]:
#this creates one shared Porter stemmer instance and sanity-checks it on a sample word
STEMMER=PorterStemmer()
print(STEMMER.stem("fishings"))

fish


In [None]:
def MY_STEMMER(str_input):
    """Tokenize ``str_input`` and return its tokens as a list of Porter stems.

    Every character that is not an ASCII letter or a hyphen is replaced by a
    space; the text is then lower-cased, split on whitespace, and each token
    is run through the module-level ``STEMMER``.
    """
    cleaned = re.sub(r"[^A-Za-z\-]", " ", str_input)
    return [STEMMER.stem(token) for token in cleaned.lower().split()]

In [None]:
#this is setting up the count vectorizer we will use later, removing stop words, and making things lowercase for consistency
#input='filename' means fit/transform are given file paths and the vectorizer reads the files itself
#tokenizer=MY_STEMMER replaces the built-in tokenization, so token_pattern is set to None:
#it is not used when a tokenizer is supplied, and leaving the default makes scikit-learn warn on every fit
#NOTE(review): stop_words='english' is compared against the *stemmed* tokens, so stop words whose
#stem differs from the listed form may not be removed -- confirm this is acceptable
MyVect_STEM=CountVectorizer(input='filename',
                        analyzer = 'word',
                        stop_words='english',
                        tokenizer=MY_STEMMER,
                        token_pattern=None,
                        lowercase = True
                        )

In [None]:
#this function drops every column whose name is not purely alphabetic (it is used later)
def RemoveNums(SomeDF):
    """Return a copy of ``SomeDF`` without its non-alphabetic columns.

    A column is kept only when ``str(column_name).isalpha()`` is true, so names
    containing digits, hyphens, or any other non-letter character are dropped.
    A column such as ``"Label"`` is purely alphabetic and is therefore kept.

    The input frame is left untouched; callers must use the returned frame.
    Non-string column labels are compared through their ``str()`` form instead
    of raising ``TypeError``.

    Parameters
    ----------
    SomeDF : pandas.DataFrame
        Frame whose column names are inspected.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the alphabetic-named columns, in their
        original order.
    """
    print("Running Remove Numbers function....\n")
    # A boolean mask (rather than a list of names) keeps the selection correct
    # even if a column name occurs more than once.
    keep_mask = np.array([str(col).isalpha() for col in SomeDF.columns], dtype=bool)
    return SomeDF.loc[:, keep_mask].copy()

In [None]:
#setting up an empty data frame that will hold the vectorized documents and their labels
FinalDF_STEM=pd.DataFrame()

In [None]:
#below is where you read in the files
#you need to put the two folders ham and spam in another folder (I titled mine corpus)
#set the SPAM_CORPUS_DIR environment variable to that folder, or edit the fallback path below,
#and everything else should work the same
CORPUS_DIR=os.environ.get("SPAM_CORPUS_DIR", "C:/Users/bradr/Desktop/g/corpus")

FileList=[]     #path of every document, spam first and then ham
FileLabels=[]   #class label of the document at the same position in FileList
for name in ["spam","ham"]:
    path=CORPUS_DIR+"/"+name
    #sorted() makes the row order independent of the OS directory-listing order
    for item in sorted(os.listdir(path)):
        next1=path+ "/" + item
        #skip sub-directories: the vectorizer opens each path as a file
        if os.path.isfile(next1):
            FileList.append(next1)
            FileLabels.append(name)

#Vectorize once, after all paths are collected. Doing this inside the file loop
#would re-read every earlier file on each iteration and append duplicate rows.
#A single fit also gives spam and ham one shared vocabulary (no NaN columns).
if FileList:
    X1=MyVect_STEM.fit_transform(FileList)
    #getting all of the vectorized word names
    ColumnNames1=MyVect_STEM.get_feature_names_out()
    FinalDF_STEM=pd.DataFrame(X1.toarray(),columns=ColumnNames1)
    #adding the labels column (spam or ham) to the vectorized data frame
    FinalDF_STEM["Label"]=FileLabels
else:
    #no documents found: keep an empty frame rather than failing inside fit_transform
    FinalDF_STEM=pd.DataFrame()


In [None]:
# Preview the first rows of the document-term frame, then its Label column.
print(FinalDF_STEM.head())
print(FinalDF_STEM['Label'])

In [None]:
#this replaces every NaN cell with 0 (a token absent from a document has count 0)
FinalDF_STEM=FinalDF_STEM.fillna(0)
#this is the use of the remove numbers function
FinalDF_STEM=RemoveNums(FinalDF_STEM)


#splitting the data into training (70%) and test (30%) data
#random_state pins the shuffle so the split and all downstream metrics are
#reproducible after Restart Kernel -> Run All
TrainDF3, TestDF3 = train_test_split(FinalDF_STEM, test_size=0.3, random_state=42)
Test3Labels=TestDF3["Label"]

In [None]:
#separate the target from the features: keep the training Label column as the
#label vector, then drop Label from both frames so only word counts reach the model
Train3Labels = TrainDF3["Label"]
TrainDF3 = TrainDF3.drop(columns="Label")
TestDF3 = TestDF3.drop(columns="Label")

In [None]:
# Multinomial Naive Bayes classifier for the stemmed word-count features.
MyModelNB1= MultinomialNB()

In [None]:
#training MNB on the training word counts together with their spam/ham labels
MyModelNB1.fit(TrainDF3, Train3Labels)

In [None]:
#making a spam/ham prediction from MNB for every document in the test frame
Prediction1 = MyModelNB1.predict(TestDF3)

In [None]:
# Print the predicted labels followed by the true test labels for comparison.
print("\nThe prediction from NB is:")
print(Prediction1)
print("\nThe actual labels are:")
print(Test3Labels)

In [None]:
#confusion matrix: the true test labels are passed first, the NB predictions second
cnf_matrix3 = confusion_matrix(Test3Labels, Prediction1)

In [None]:
print("\nThe confusion matrix is:")
print(cnf_matrix3)

### prediction probabilities
## columns are the labels in alphabetical order
## The decimals in the matrix are the probability of being
## that label (ham or spam), rounded to 2 decimal places
print(np.round(MyModelNB1.predict_proba(TestDF3),2))