##### Importing Package

In [1]:
import pandas as pd

##### Loading the dataset

In [2]:
data = pd.read_csv("data_part_a.csv")

#####  Look at the data / Data head 

In [3]:
data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


##### Loading Features and Target

In [4]:
X = data.iloc[:, 1].values  
y = data.iloc[:, 0].values

In [5]:
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [6]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

##### Transforming using TF-IDF Transformer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
X_counts.shape

(5574, 8713)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
X_tfidf.shape

(5574, 8713)

In [9]:
X_tfidf

<5574x8713 sparse matrix of type '<class 'numpy.float64'>'
	with 74169 stored elements in Compressed Sparse Row format>

##### Split Train and Test Sets

In [10]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.20)

##### Fitting the model

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)

##### Predict Metric

In [12]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix  
y_pred = clf.predict(X_test)
print(np.mean(y_pred == y_test))
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

0.9506726457399103
[[954   0]
 [ 55 106]]
             precision    recall  f1-score   support

        ham       0.95      1.00      0.97       954
       spam       1.00      0.66      0.79       161

avg / total       0.95      0.95      0.95      1115



##### Let's try it out

In [13]:
x = ["I am going for shopping tomorrow. Wanna join?"]

In [14]:
c = count_vect.transform(x)
c_tfidf = tfidf_transformer.transform(c)
clf.predict(c_tfidf)

array(['ham'], dtype='<U4')

In [15]:
x = ["CONGRATS! Your mobile number has been selected in a Lucky draw. Call on 8447621901 to claim your guaranteed reward. Hurry!"]

In [16]:
c = count_vect.transform(x)
c_tfidf = tfidf_transformer.transform(c)
clf.predict(c_tfidf)

array(['spam'], dtype='<U4')