In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize , sent_tokenize
import string
from textblob import TextBlob as tb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB , MultinomialNB

In [542]:
df = pd.read_csv('dataset/spam.csv')

In [543]:
df.drop(df.iloc[:,2:],inplace= True , axis= 1)
df.rename(columns= {'v1':'target','v2':'text'}, inplace= True)
df.drop_duplicates(inplace= True , keep= 'first')

In [544]:
df.sample(5)

Unnamed: 0,target,text
30,ham,"Wait that's still not all that clear, were you..."
3183,ham,Good morning pookie pie! Lol hope I didn't wak...
2482,ham,Pansy! You've been living in a jungle for two ...
4808,ham,"Don't worry though, I understand how important..."
1098,ham,NO GIFTS!! You trying to get me to throw mysel...


In [545]:
df.head(30)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## text preprocessing

In [547]:
df.head(100)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
95,spam,Your free ringtone is waiting to be collected....
96,ham,Watching telugu movie..wat abt u?
97,ham,i see. When we finish we have loads of loans t...
98,ham,Hi. Wk been ok - on hols now! Yes on for a bit...


In [14]:

ps = PorterStemmer()
def text_preprocessing(text):
    
    
    txt = text.lower()
    # print(txt)
    
    # removing punctuations
    for punc in string.punctuation:
        txt = txt.replace(punc , '')
    
    #removing b'\xef\xbf\xbd' , an emoji present
    txt = (txt.encode().replace(b'\xef\xbf\xbd' , b'')).decode()
    
    #removing stopwords
    txt_ = []
    txt = txt.split()
    # print(len(txt), '\n')
    for i in txt:
        if i in stopwords.words('english'):
            pass
        else:
            txt_.append(i)
    
    # appling stemming and correcting spellings
    for i in txt_:
        
        correct_spell = str(tb(ps.stem(i)).correct())
        txt_[txt_.index(i)] = correct_spell

    txt = ' '.join(txt_)
    
    txt_.clear()
    
    
    return txt

import time

t = time.time()
df['text'] = df['text'].apply(text_preprocessing)
print((time.time() - t) )

In [691]:
def chnge(target):
    if target == 'spam':
        return 1
    elif target == 'ham':
        return 0

df['target'] = df['target'].apply(chnge)

## feature extraction

In [2]:
df1 = pd.read_csv('ham_spam_text_preprocessed.csv')
df1.drop(df1.iloc[:,0:1] , axis = 1 , inplace = True)
df1.dropna(inplace = True)

In [3]:
df1[:5160]

Unnamed: 0,target,text
0,0,go point crazy avail bug n great world la e bu...
1,0,ok war joke u
2,1,free entry 2 come win cup final tut must may 2...
3,0,u dun say early u c already say
4,0,ah dont think go us live around though
...,...,...
5160,0,and 6 like dat for
5161,0,dont wait til least wednesday see get
5162,0,hut let
5163,1,remind of get 250 pound free call credit detai...


In [4]:
cv = CountVectorizer()
bow = cv.fit_transform(df1['text']).toarray()
new_df = pd.DataFrame(bow)
new_df['target'] = df1['target']
new_df.dropna(inplace= True)

In [5]:
x_train , x_test , y_train , y_test = train_test_split(new_df.iloc[:,:-1].values , new_df['target'].values)

In [6]:
nb = MultinomialNB()
nb.fit(x_train , y_train)

In [7]:
nb.score(x_test , y_test)

0.8294573643410853

## rnf

In [8]:
from sklearn.ensemble import RandomForestClassifier
rnf = RandomForestClassifier(n_estimators= 10)
rnf.fit(x_train , y_train)

In [9]:
rnf.score(x_test , y_test)

0.8503875968992248

## svc

In [10]:
from sklearn.svm import SVC
svc = SVC()
# svc.fit(x_train , y_train)
# svc.score(x_test , y_test)

In [11]:
from sklearn.ensemble import VotingClassifier
estimators = [('rnf',rnf) , ('naive',nb) ,('svc',svc)]

vc = VotingClassifier(estimators = estimators)
vc.fit(x_train , y_train)

In [12]:
vc.score(x_test , y_test)

0.8713178294573644

# so now lets check for a mail from the rich dad website , which in my device is a ham mail(0)

In [15]:
txt = "Did you turn the lights on today? Maybe drove a car? Use a computer or phone? You use and need energy. Civilization requires energy. Without energy, fossil fuels, and renewables, civilization would collapse. The United States needs oil, lots of oil. Civilization cannot grow without energy. If energy became scarce or more expensive, civilization would crumble. Hence the government wants entrepreneurs to provide energy, so there are tax incentives for oil and gas exploration. Today, the United States is less dependent upon foreign oil. This creates some great opportunities for investing in the United States. Below is a great partner’s way to invest in energy. It’s a little different and can be used by anyone… not just the rich. Interested? Read below. Rich Dad"
new_txt = text_preprocessing(txt)
q = cv.transform([new_txt]).toarray()

In [16]:
vc.predict(q)

array([0.])