## Goal: Train a Naive Bayes model to classify future SMS messages as either spam or ham.

Steps:

1.  Convert the labels `ham` and `spam` to a binary indicator variable (0/1)

2.  Convert the `txt` column to a sparse matrix of TF-IDF vectors

3.  Fit a Naive Bayes Classifier

4.  Measure your success using roc_auc_score



In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    # train_test_split lives in model_selection since scikit-learn 0.18;
    # the cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score



In [2]:
# Load the tab-separated SMS corpus. Column names are supplied explicitly, so
# pandas treats the first line of the file as data: 'spam' holds the ham/spam
# label and 'txt' holds the message text.
df = pd.read_csv(
    "../All_Codes_from_Instructor/CS570-master/data/SMSSpamCollection",
    sep='\t',
    names=['spam', 'txt'],
)

In [3]:
# Preview the first five raw rows (label + message text).
df.head(n=5)

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Nominal to numeric: replace each categorical column with 0/1 indicator columns.
categorical_variables = ['spam']

for variable in categorical_variables:
    # Fill missing data with the word "Missing". Work on the returned Series:
    # calling fillna(inplace=True) on df[variable] acts on a temporary object and
    # is not guaranteed to update df (pandas warns about it and, under
    # copy-on-write, ignores it).
    filled = df[variable].fillna("Missing")
    # Create one indicator column per category (e.g. spam_ham / spam_spam), stored
    # as uint8 0/1 regardless of the pandas default dtype for dummies.
    dummies = pd.get_dummies(filled, prefix=variable).astype(np.uint8)
    # Swap the original column for its indicator columns.
    df = pd.concat([df.drop([variable], axis=1), dummies], axis=1)

In [5]:
# TF-IDF vectorizer configured with NLTK's English stop-word list.
try:
    stopset = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is not installed on this machine yet; fetch it once
    # so the notebook survives a fresh "Restart & Run All".
    nltk.download('stopwords')
    stopset = set(stopwords.words('english'))

# scikit-learn validates `stop_words` as 'english', a list, or None, so hand it a
# list instead of the set (sorted only to make the order deterministic).
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=sorted(stopset))

In [6]:
# Preview the frame again: the label column is now the spam_ham / spam_spam indicators.
df.head(n=5)

Unnamed: 0,txt,spam_ham,spam_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [7]:
# Target vector: the spam indicator column, 1 for spam and 0 for ham.
y = df["spam_spam"]

In [8]:
# Learn the vocabulary and IDF weights from the message text and encode every
# message as a TF-IDF row vector.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the IDF weights also reflect the held-out messages.
X = vectorizer.fit_transform(df["txt"])

In [9]:
# Sanity-check that labels and features line up: one label per message, and one
# matrix row per message with one column per vocabulary term.
for obj in (y, X):
    print(obj.shape)

(5572,)
(5572, 8587)


In [10]:
# Hold out part of the data for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Fit a multinomial Naive Bayes classifier on the training portion.
clf = naive_bayes.MultinomialNB().fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell shows its repr.
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Evaluate with ROC AUC (a ranking metric, not accuracy): each held-out message
# is scored by the second predict_proba column, which is the probability of
# label 1 (spam) when the classes are 0/1.
spam_probability = clf.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, spam_probability)

0.98589322144123448

In [13]:
# Peek at the first five target values (1 = spam, 0 = ham).
y.head(n=5)

0    0
1    0
2    1
3    0
4    0
Name: spam_spam, dtype: uint8

In [14]:
# Spot-check a single message. It is encoded with the already-fitted vectorizer
# (transform, not fit_transform) so its columns match the training matrix.
new_email = np.array(["Free entry in 2 a wkly comp to win FA Cup fina"])
new_email_vector = vectorizer.transform(new_email)

predicted_label = clf.predict(new_email_vector)
print(predicted_label)

[1]
