In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# DATA LOADING

In [None]:
# Read the raw CSV and discard the three unnamed columns in a single chain.
df = (
    pd.read_csv('spam.csv')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

In [None]:
# Preview the first rows of the frame after the unnamed columns were dropped.
df.head()

In [None]:
# Replace ham with 0 and spam with 1, in the label column only.
# A frame-wide df.replace(...) also rewrites any message in v2 whose entire
# text is exactly 'ham' or 'spam' into an integer, which would later break
# len() and re.sub() on that row.
df = df.assign(v1=df['v1'].replace({'ham': 0, 'spam': 1}))

In [None]:
# Preview again to check that the labels are now 0/1.
df.head()

#### COUNTING NUMBER OF CHARACTERS IN EACH TEXT

In [None]:
# Length of every message as returned by len() on the string, i.e. the number
# of characters (not whitespace-separated words).
# Vectorised: avoids one .loc assignment per row and does not depend on the
# frame having a default 0..n-1 index. A missing message yields NaN here
# instead of raising.
df['Count'] = df['v2'].str.len()

In [None]:
# Preview the frame with the newly added Count column.
df.head()

In [None]:
# Class balance: how many ham (0) and spam (1) messages the label column v1 holds.
df['v1'].value_counts()

In [None]:
# Column dtypes and non-null counts, as a quick check for missing values.
df.info()

In [None]:
# One Porter stemmer shared by every message, and the list that collects the
# cleaned message strings.
ps = PorterStemmer()
corpus = list()

In [None]:
# Show the first two raw messages, before any cleaning is applied.
for row_label in (0, 1):
    print(df['v2'][row_label])

## Processing Messages

In [None]:
# Ordered (pattern, replacement) pairs applied to every message.
#
# All patterns are raw strings: in a normal string literal '\b' is the
# backspace character, so the e-mail and phone patterns could never match on a
# word boundary.
#
# Order matters: e-mail and URL tokens are collapsed first, then amounts of
# money, then phone numbers, then any remaining number, and finally every
# remaining punctuation character is turned into a space.
_NORMALISERS = [
    # Replace email addresses with 'email'
    (re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b'), 'email'),
    # Replace URLs with 'http'
    (re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)'), 'http'),
    # Replace amounts of money with 'moneysymb'. The currency marker is
    # mandatory: with an optional marker this pattern matches every bare number
    # and leaves nothing for the phone/number steps below. '£' is accepted in
    # addition to '$', '€' and '¥'.
    # NOTE(review): the three-capital-letter branch is meant for currency codes
    # but matches any three-letter upper-case word directly before a number;
    # consider restricting it to a list of known codes.
    (re.compile(r'(?:\b[A-Z]{3}|[A-Z]?[$€¥£])\s?\d+(?:[.,]\d+)*'), 'moneysymb'),
    # Replace phone numbers with 'phone'
    (re.compile(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'), 'phone'),
    # Replace numbers with 'numbr'
    (re.compile(r'\d+(\.\d+)?'), 'numbr'),
    # Remove all punctuations
    (re.compile(r'[^\w\d\s]'), ' '),
]


def _clean_message(text, stemmer, stop_words):
    """Return the normalised, stop-word-free, stemmed form of one message.

    Parameters
    ----------
    text : str
        Raw message text.
    stemmer : object
        Any object with a ``stem(word)`` method (here the PorterStemmer).
    stop_words : set of str
        Lower-case words to drop before stemming.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces.
    """
    # Each substitution works on the output of the previous one. Applying every
    # pattern to the raw text instead would keep only the last substitution.
    for pattern, replacement in _NORMALISERS:
        text = pattern.sub(replacement, text)

    # Lower-case, tokenise on whitespace, drop stop words, then stem.
    tokens = text.lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


# Build the stop-word set once instead of once per token.
english_stop_words = set(stopwords.words('english'))

# Rebuild the corpus from every row of v2 (no hard-coded row count). Assigning
# a fresh list rather than appending keeps this cell safe to re-run.
# NOTE: the placeholder substitutions now actually reach the corpus, so any
# accuracy figures recorded from earlier runs need to be regenerated.
corpus = [_clean_message(text, ps, english_stop_words) for text in df['v2']]

In [None]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus, count
# term occurrences per message, then expand the result into a dense array.
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

# Applying Classification

- Input : Prepared bag-of-words count matrix (converted to a dense array)
- Output : Labels (Spam or Ham)

In [None]:
# Target vector: the label column, with its class counts.
y = df['v1']
print(y.value_counts())

# First two labels, for comparison with the encoded values printed later.
for row_label in (0, 1):
    print(y[row_label])

### Encoding Labels

In [None]:
# Fit the encoder on the labels and rebind y to the encoded result.
le = LabelEncoder()
y = le.fit_transform(y)

# First two encoded labels.
for position in (0, 1):
    print(y[position])

### Splitting to Training and Testing DATA

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

# Applying Gaussian Naive Bayes Model

In [None]:
# Create the Gaussian Naive Bayes model and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
bayes_classifier = GaussianNB().fit(xtrain, ytrain)
# Bare expression so the cell still displays the fitted estimator.
bayes_classifier

In [None]:
# Predict labels for the held-out test rows with the fitted Naive Bayes model.
y_pred = bayes_classifier.predict(xtest)

## Results

In [None]:
# Confusion matrix comparing the true test labels with the Naive Bayes predictions.
cm = confusion_matrix(ytest, y_pred)

In [None]:
# Display the Naive Bayes confusion matrix.
cm

In [None]:
# Accuracy and per-class precision/recall/F1 for the Naive Bayes model.
# Reuses y_pred (the predictions already computed for the confusion matrix)
# rather than calling predict() on the test set two more times, so all reported
# metrics are derived from the same prediction array.
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, y_pred))
print (classification_report(ytest, y_pred))

# Applying Decision Tree

In [None]:
# Create the decision tree with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeClassifier(random_state=50).fit(xtrain, ytrain)
# Bare expression so the cell still displays the fitted estimator.
dt

In [None]:
# Predict labels for the held-out test rows with the fitted decision tree.
y_pred_dt = dt.predict(xtest)

## Results

In [None]:
# Confusion matrix for the decision tree predictions. This rebinds cm, which
# previously held the Naive Bayes matrix.
cm = confusion_matrix(y_true=ytest, y_pred=y_pred_dt)
print(cm)

In [None]:
# Accuracy and per-class precision/recall/F1 for the decision tree.
# Reuses y_pred_dt (the predictions already computed for the confusion matrix)
# rather than calling predict() on the test set two more times, so all reported
# metrics are derived from the same prediction array.
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, y_pred_dt))
print (classification_report(ytest, y_pred_dt))

# Final Accuracy

- **Decision Tree : 96.861%**
- **Gaussian NB   : 87.085%**