In [1]:
import pandas as pd

#docs = pd.read_excel('SMSSpamCollection.xls',header=None,names=['Class', 'SMS']) 
docs = pd.read_table('SMSSpamCollection', header=None, names=['Class', 'sms'])

#classifier in column 1, sms in column 2.
docs.head()


Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# counting spam and ham instances
# df.column_name.value_counts() - gives no. of unique inputs in the columns

ham_spam=docs.Class.value_counts()
ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [3]:
# mapping labels to 0 and 1
docs['label'] = docs.Class.map({'ham':0, 'spam':1})

In [4]:
docs.head()

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
X = docs.sms
y = docs.label
print(X.shape)
print(y.shape)


(5572,)
(5572,)


In [6]:
# splitting into test and train

from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.20, random_state=42)

In [7]:
X_train.head()

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: sms, dtype: object

In [8]:
#print(X_train.shape)

In [9]:
#print(y_train.shape)

In [10]:
#print(X_test.shape)

In [11]:
#print(y_test.shape)

Imagine breaking X in individual words and putting them all in a bag. Then we pick all the unique words from the bag one by one and make a dictionary of unique words.

This is called vectorization of words. We have the class CountVectorizer() in scikit learn to vectorize the words. Let us first see it in action before explaining it further.

In [12]:
# vectorizing the sentences; removing stop words
#from sklearn.feature_extraction.text import CountVectorizer
#vect = CountVectorizer(stop_words='english')

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

Here vect is an object of class CountVectorizer(). This has a method called fit() which converts a corpus of documents into a vector of unique words as shown below.

In [14]:
vect.fit(X_train)
vect.vocabulary_

{'reply': 5688,
 'to': 6889,
 'win': 7475,
 '100': 258,
 'weekly': 7397,
 'where': 7438,
 'will': 7472,
 'the': 6774,
 '2006': 354,
 'fifa': 2806,
 'world': 7556,
 'cup': 2107,
 'be': 1272,
 'held': 3365,
 'send': 5981,
 'stop': 6461,
 '87239': 695,
 'end': 2569,
 'service': 6000,
 'hello': 3370,
 'sort': 6305,
 'of': 4855,
 'out': 4977,
 'in': 3604,
 'town': 6960,
 'already': 925,
 'that': 6771,
 'so': 6253,
 'dont': 2396,
 'rush': 5826,
 'home': 3442,
 'am': 935,
 'eating': 2505,
 'nachos': 4651,
 'let': 4058,
 'you': 7663,
 'know': 3927,
 'eta': 2628,
 'how': 3488,
 'come': 1903,
 'guoyang': 3259,
 'go': 3141,
 'tell': 6712,
 'her': 3385,
 'then': 6785,
 'told': 6905,
 'hey': 3393,
 'sathya': 5882,
 'till': 6857,
 'now': 4809,
 'we': 7373,
 'dint': 2324,
 'meet': 4393,
 'not': 4798,
 'even': 2639,
 'single': 6148,
 'time': 6858,
 'can': 1630,
 'saw': 5895,
 'situation': 6165,
 'orange': 4946,
 'brings': 1510,
 'ringtones': 5764,
 'from': 2994,
 'all': 912,
 'chart': 1734,
 'heroes':

Countvectorizer() has converted the documents into a set of unique words.

Stop Words

We can see a few trivial words such as 'and','is','of', etc. These words don't really make any difference in classyfying a document. These are called 'stop words'. So we would like to get rid of them.

We can remove them by passing a parameter stop_words='english' while instantiating Countvectorizer() as follows:

In [15]:
# removing the stop words
vect = CountVectorizer(stop_words='english')


In [16]:
 vectorizer = vect.fit(X)
# X_train_dtm = vect.transform(X_train)


In [17]:
vect.vocabulary_
# printing the vocabulary

{'jurong': 4248,
 'point': 5779,
 'crazy': 2282,
 'available': 1284,
 'bugis': 1719,
 'great': 3555,
 'world': 8285,
 'la': 4374,
 'buffet': 1717,
 'cine': 2007,
 'got': 3515,
 'amore': 1064,
 'wat': 8083,
 'ok': 5377,
 'lar': 4410,
 'joking': 4216,
 'wif': 8191,
 'oni': 5403,
 'free': 3280,
 'entry': 2889,
 'wkly': 8243,
 'comp': 2123,
 'win': 8203,
 'fa': 3018,
 'cup': 2341,
 'final': 3135,
 'tkts': 7569,
 '21st': 412,
 '2005': 403,
 'text': 7437,
 '87121': 794,
 'receive': 6158,
 'question': 6052,
 'std': 7077,
 'txt': 7754,
 'rate': 6104,
 'apply': 1141,
 '08452810075over18': 77,
 'dun': 2751,
 'say': 6496,
 'early': 2770,
 'hor': 3840,
 'nah': 5124,
 'don': 2663,
 'think': 7492,
 'goes': 3479,
 'usf': 7892,
 'lives': 4562,
 'freemsg': 3287,
 'hey': 3757,
 'darling': 2398,
 'week': 8129,
 'word': 8276,
 'like': 4512,
 'fun': 3342,
 'tb': 7372,
 'xxx': 8350,
 'chgs': 1961,
 'send': 6584,
 '50': 616,
 'rcv': 6116,
 'brother': 1689,
 'speak': 6959,
 'treat': 7687,
 'aids': 997,
 'pate

In [18]:
# printing feature names
print(vect.get_feature_names())
print(len(vect.get_feature_names()))

8444


So our final dictionary is made of 8444 words (after discarding the stop words). Now, to do classification, we need to represent all the documents with respect to these words in the form of features.

Every document will be converted into a feature vector representing presence of these words in that document. Let's convert each of our training documents in to a feature vector.

In [19]:
# transforming the train and test datasets
X_train_transformed = vect.transform(X_train)
X_test_transformed =vect.transform(X_test)

In [20]:
# note that the type is transformed matrix
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 298)	1
  (0, 404)	1
  (0, 798)	1
  (0, 2341)	1
  (0, 2853)	1
  (0, 3116)	1
  (0, 3729)	1
  (0, 6260)	1
  (0, 6584)	1
  (0, 6606)	1
  (0, 7112)	1
  (0, 8133)	1
  (0, 8203)	1
  (0, 8285)	1
  (1, 2666)	1
  (1, 2785)	1
  (1, 2925)	1
  (1, 3734)	1
  (1, 3813)	1
  (1, 4345)	1
  (1, 4484)	1
  (1, 5121)	1
  (1, 6420)	1
  (1, 6925)	1
  (1, 7653)	1
  :	:
  (4452, 2783)	1
  (4452, 3515)	1
  (4452, 3761)	1
  (4452, 4537)	1
  (4452, 8246)	1
  (4452, 8263)	1
  (4453, 4422)	1
  (4453, 4566)	1
  (4453, 6924)	1
  (4454, 3310)	1
  (4454, 3716)	1
  (4454, 5854)	1
  (4454, 6142)	1
  (4454, 6930)	1
  (4454, 6931)	1
  (4455, 4216)	1
  (4455, 5302)	1
  (4455, 6603)	1
  (4455, 7598)	1
  (4456, 2546)	1
  (4456, 4250)	1
  (4456, 5132)	1
  (4456, 6496)	1
  (4456, 6894)	1
  (4456, 7349)	1


In [21]:
# training the NB model and making predictions
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
naive_model = mnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
#y_pred_proba =mnb.predict_proba(X_test_transformed)

#print(y_pred_proba)
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


0.9829596412556054

In [22]:
y_pred_class

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[953,  13],
       [  6, 143]])

In [24]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
#[row, column]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
TP = confusion[1, 1]

[[953  13]
 [  6 143]]


In [25]:
sensitivity = TP / float(FN + TP)
print("sensitivity",sensitivity)

sensitivity 0.959731543624161


In [26]:
specificity = TN / float(TN + FP)

print("specificity",specificity)

specificity 0.9865424430641822


In [27]:
precision = TP / float(TP + FP)

print("precision",precision)
print(metrics.precision_score(y_test, y_pred_class))

precision 0.9166666666666666
0.9166666666666666


In [28]:
print("precision",precision)
print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, y_pred_class))

precision 0.9166666666666666
PRECISION SCORE : 0.9166666666666666
RECALL SCORE : 0.959731543624161
F1 SCORE : 0.9377049180327869


In [29]:
# Saving the multinomial nb model as a pickle file
import joblib
fin_model = naive_model
joblib.dump(fin_model, 'naive_model.joblib')
joblib.dump(vectorizer, 'CountVectorizer.joblib')

['CountVectorizer.joblib']