In [3]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Load the raw dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# NOTE(review): `le` is not used in any of the visible cells — confirm whether
# the labels were meant to be encoded, or drop it.
le = LabelEncoder()

In [5]:
# Convert the DataFrame to a NumPy array so columns can be sliced by position.
data = df.to_numpy()

In [6]:
# Column 1 supplies the model inputs and column 0 the targets
# (presumably message text and ham/spam label — verify against the CSV header).
X = data[:, 1]
y = data[:, 0]

In [7]:
# Sanity check: inputs and targets should have the same number of rows.
X.shape, y.shape

((5572,), (5572,))

In [8]:
# Shared text-preprocessing tools used by getStem.
# The pattern is a raw string: '\w' inside a normal literal is an invalid
# escape sequence that newer Python versions warn about. The regex itself
# is unchanged (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally.
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [9]:
def getStem(review):
    """Return a cleaned version of one message.

    The text is lowercased, split into tokens with the module-level
    ``tokenizer``, filtered against the stopword set ``sw``, stemmed with
    ``ps`` and finally re-joined with single spaces.
    """
    words = tokenizer.tokenize(review.lower())
    # Drop stopwords first, then stem whatever survives.
    stems = (ps.stem(word) for word in words if word not in sw)
    return ' '.join(stems)

In [10]:
def getDoc(document):
    """Apply getStem to every entry of ``document`` and return the results as a list."""
    return [getStem(text) for text in document]

In [11]:
# Clean every message in X (lowercase, tokenize, remove stopwords, stem).
stemmed_doc = getDoc(X)

In [12]:
# Preview the first ten cleaned messages.
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [13]:
# Bag-of-words vectorizer with default settings; the same instance is reused
# later to transform unseen messages.
cv = CountVectorizer()

In [14]:
# Learn the vocabulary from the cleaned corpus and build the document-term count matrix.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed further down.
vc = cv.fit_transform(stemmed_doc)

In [15]:
# Dense array of token counts. `toarray()` is used rather than `todense()`
# because the latter returns an np.matrix, which recent scikit-learn releases
# refuse as estimator input; the values are identical.
# NOTE(review): this rebinds X, which previously held the raw messages.
X = vc.toarray()

In [16]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
# (The stray "..." REPL continuation prompt was removed — it is not valid Python
# outside of IPython's prompt stripping.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [17]:
# Naive Bayes classifier from scikit-learn

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Fit a multinomial Naive Bayes classifier on the training counts.
model = MultinomialNB()
model.fit(X_train, y_train)
# Score the fitted model on the held-out split.
model.score(X_test, y_test)

0.977705274605764

In [32]:
# Three hand-written sample messages used to try the trained model on unseen text.
messages = [
    """
    
Hi there,

This is a call to arms to help us improve the coding culture across Indian colleges.
We know this isn't an easy feat, and we need dedicated and determined people like you to take up the role of a College Chapter Leader and lead your university to a metaphorical "coding haven"

Being a part of a College Chapter comes with a ton of advantages. Not only will you be privy to a coding culture unlike anything you have ever seen before, you will also be able to interact with a bunch of coding fanatics like yourself and take your coding skills to another level. You can learn more about CodeChef College Chapters here.  

Additionally, as a Chapter Leader, this will be an excellent opportunity for you to:
Make an impact 
Network with fellow college leaders across the country
Build professional skills
CodeChef rewards strong performance, and being a Chapter Leader will open you up to advantages like:
Appreciation certificates from CodeChef for credible performances, 
Letters of recommendation from CodeChef, for an edge over everyone else,
The scope of connecting with industry professionals, expanding your network, and a lot more.
This is your chance to revolutionize the coding culture in Indian colleges, so start your own College Chapter now, and ensure that your college has an enviable coding culture. To know how to register, click here.""",
    """Join us today at 12:00 PM ET / 16:00 UTC for a Red Hat DevNation tech talk on AWS Lambda and serverless Java with Bill Burke.
Have you ever tried Java on AWS Lambda but found that the cold-start latency and memory usage were far too high? 
In this session, we will show how we optimized Java for serverless applications by leveraging GraalVM with Quarkus to 
provide both supersonic startup speed and a subatomic memory footprint.""",

    """We really appreciate your interest and wanted to let you know that we have received your application.
There is strong competition for jobs at Intel, and we receive many applications. As a result, it may take some time to get back to you.
Whether or not this position ends up being a fit, we will keep your information per data retention policies, 
so we can contact you for other positions that align to your experience and skill set.
"""
]

In [33]:
def prepare(messages):
    """Clean raw messages and vectorize them with the already-fitted ``cv``.

    ``transform`` (not ``fit_transform``) is used on purpose: refitting would
    build a new vocabulary instead of reusing the one learned earlier.
    """
    return cv.transform(getDoc(messages))

# NOTE(review): this rebinds `messages` from the raw strings to the transformed
# matrix, so the cell is not idempotent — re-running it would feed the matrix
# back into prepare().
messages = prepare(messages)

In [34]:
# Predict a label for each prepared sample message and display the result.
y_pred = model.predict(messages)
y_pred

array(['ham', 'spam', 'ham'], dtype='<U4')