In [24]:
import pandas as pd
import numpy as np
import nltk
import re

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
df = pd.read_csv("/content/drive/MyDrive/data/Spam mails/spam_ham_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


<h3>1)Tokenizing the sentences into words</h3>

Tokenize the rows to sentences

In [27]:
sentences = []
for i in df['text']:
  temp_sentence = nltk.sent_tokenize(i)
  for j in temp_sentence:
    words = nltk.sent_tokenize(j)
    for k in range(len(words)):
      words[k] = words[k].lower()
      words[k] = re.sub(r'\W', ' ', words[k])
      words[k] = re.sub(r'\s+', ' ', words[k])
      sentences.append(words[k])

In [28]:
for i in range(10):
  print(sentences[i])

subject enron methanol meter 988291 this is a follow up to the note i gave you on monday 4 3 00 preliminary flow data provided by daren 
please override pop s daily volume presently zero to reflect daily activity you can obtain from gas control 
this change is needed asap for economics purposes 
subject hpl nom for january 9 2001 see attached file hplnol 09 xls hplnol 09 xls
subject neon retreat ho ho ho we re around to that most wonderful time of the year neon leaders retreat time 
i know that this time of year is extremely hectic and that it s tough to think about anything past the holidays but life does go on past the week of december 25 through january 1 and that s what i d like you to think about for a minute 
on the calender that i handed out at the beginning of the fall semester the retreat was scheduled for the weekend of january 5 6 but because of a youth ministers conference that brad and dustin are connected with that week we re going to change the date to the following week

Tokenize the sentences to words and create a dictionary with unique words and it's frequency

In [29]:
wrds = []
for i in sentences:
    words = nltk.word_tokenize(i)
    for word in words:
        wrds.append(word)
print(wrds[:10])
print(len(wrds))

['subject', 'enron', 'methanol', 'meter', '988291', 'this', 'is', 'a', 'follow', 'up']
855083


Removing stop words

In [30]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
len(wrds)

855083

In [32]:
stri = ""
for i in wrds:
  stri += (" " + i)

In [33]:
import re
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
stri = pattern.sub('', stri)

In [34]:
print(stri[:20])

 subject enron metha


In [35]:
wrds = stri.split()

In [36]:
print(len(wrds))

589111


<h3>2)Stemming</h3>

In [37]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [38]:
for i in wrds:
  i = ps.stem(i)

Deleting subject since every row has subject in it.

In [39]:
frequency ={}
for i in wrds:
 if i not in frequency.keys():
    frequency[i] = 1
 else:
    frequency[i] += 1

In [40]:
print(len(frequency))

50329


Deleting the words with a high frequency(stopwords)

In [41]:
import heapq
stopwords = heapq.nlargest(100, frequency, key=frequency.get)

In [42]:
for i in stopwords:
  frequency.pop(i)
print(len(frequency))

50229


<h3>3)Logistic Regression</h3>

In [43]:
y = df['label_num']

In [45]:
X = []
for data in df['text']:
    vector = []
    for word in stopwords:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else:
            vector.append(0)
    X.append(vector)
X = np.asarray(X)

In [46]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [47]:
df_X = pd.DataFrame(X, columns = stopwords)

In [48]:
df_X.head()

Unnamed: 0,ect,subject,hou,enron,2000,com,please,gas,_,3,...,bob,production,flow,x,call,file,b,like,net,25
0,0,0,0,1,0,0,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, y, test_size = 0.25)

In [50]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)

LogisticRegression()

In [53]:
predictions = LR.predict(X_test)
score = LR.score(X_test,y_test)
print(score)

0.9404485692188709


In [52]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [54]:
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

[[856  55]
 [ 22 360]]
