In [17]:
import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
df = pd.read_csv('spam_ham_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [8]:
df['text'] = df['text'].apply(lambda x:x.replace('\n',''))

In [11]:
df.text.iloc[2]

"Subject: neon retreat\rho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time !\ri know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute .\ron the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about .\ri think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a 

In [30]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\rth...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r( see a...",0
2,3624,ham,"Subject: neon retreat\rho ho ho , we ' re arou...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\rthis deal is to ...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\rthe transport v...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\rhpl ...,0
5168,2933,ham,Subject: calpine daily gas nomination\r>\r>\rj...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [13]:
stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(df)):
  text = df['text'].iloc[i].lower()
  text = text.translate(str.maketrans('','', string.punctuation)).split()
  text = [stemmer.stem(word) for word in text if word not in stopwords_set]
  text = ' '.join(text)
  corpus.append(text)

In [14]:
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291\rthis is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\rflow data provided by daren } .\rplease override pop ' s daily volume { presently zero } to reflect daily\ractivity you can obtain from gas control .\rthis change is needed asap for economics purposes ."

In [15]:
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [19]:
vectorizer = CountVectorizer()

x = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .02)


In [20]:
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(x_train, y_train)

In [21]:
clf.score(x_test, y_test)

0.9807692307692307

In [25]:
email_to_classify = df.text.values[10]
email_to_classify

"Subject: vocable % rnd - word asceticism\rvcsc - brand new stock for your attention\rvocalscape inc - the stock symbol is : vcsc\rvcsc will be our top stock pick for the month of april - stock expected to\rbounce to 12 cents level\rthe stock hit its all time low and will bounce back\rstock is going to explode in next 5 days - watch it soar\rwatch the stock go crazy this and next week .\rbreaking news - vocalscape inc . announces agreement to resell mix network\rservices\rcurrent price : $ 0 . 025\rwe expect projected speculative price in next 5 days : $ 0 . 12\rwe expect projected speculative price in next 15 days : $ 0 . 15\rvocalscape networks inc . is building a company that ' s revolutionizing the\rtelecommunications industry with the most affordable phone systems , hardware ,\ronline software , and rates in canada and the us . vocalscape , a company with\rglobal reach , is receiving international attention for the development of voice\rover ip ( voip ) application solutions , inc

In [27]:
email_text = email_to_classify.lower().translate(str.maketrans('','', string.punctuation)).split()
email_text = [stemmer.stem(word) for word in email_text if word not in stopwords_set]
email_text = ' '.join(email_text)

email_corpus = [email_text]

x_email = vectorizer.transform(email_corpus)

In [28]:
clf.predict(x_email)

array([1])

In [29]:
df.label_num.iloc[10]

np.int64(1)