In [2]:
# Load the SMS corpus. header=None means the file has no header row,
# so the two column names are supplied explicitly.
import pandas as pd

docs = pd.read_table(
    'SMSSpamCollection+(1)',
    header=None,
    names=['Class', 'Sms'],
)

docs.head()

Unnamed: 0,Class,Sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Count how many messages carry each label in the 'Class' column.
ham_spam = docs['Class'].value_counts()

ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [6]:
# Share of messages labelled 'spam', as a percentage rounded to 3 decimals.
# The count is looked up by label: `ham_spam[1]` relied on pandas falling back
# to positional access for an integer key on a string-labelled Series (a
# deprecated behaviour) and on 'spam' being the second most frequent class.

spam_percent = round(ham_spam['spam'] / ham_spam.sum() * 100, 3)

print('Spam is: {}%'.format(spam_percent))

Spam is: 13.406%


In [7]:
# Features are the 'Sms' column; the target is the 'Class' column.

X, y = docs['Sms'], docs['Class']

print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [10]:
# Hold out 30% of the rows for testing; random_state is fixed so the
# split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [13]:
X_train.head()

4393                       what are your new years plans?
216     Finally the match heading towards draw as your...
4471    Lemme know when I can swing by and pick up, I'...
3889                   ok....take care.umma to you too...
5030    I'd like to tell you my deepest darkest fantas...
Name: Sms, dtype: object

In [15]:
# Token-count vectoriser configured to drop scikit-learn's built-in
# English stop-word list.

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words='english')

In [16]:
#fitting the training data
# Only X_train is passed to fit, so the vocabulary is built from the training messages alone.

vec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
vec.vocabulary_

{'new': 4272,
 'years': 6855,
 'plans': 4682,
 'finally': 2550,
 'match': 3930,
 'heading': 3018,
 'draw': 2202,
 'prediction': 4801,
 'lemme': 3650,
 'know': 3548,
 'swing': 5957,
 'pick': 4644,
 'free': 2663,
 'basically': 1119,
 'time': 6163,
 'semester': 5369,
 'ok': 4406,
 'care': 1485,
 'umma': 6353,
 'like': 3685,
 'tell': 6043,
 'deepest': 2000,
 'darkest': 1949,
 'fantasies': 2478,
 '09094646631': 222,
 'just': 3472,
 '60p': 538,
 'min': 4022,
 'stop': 5804,
 'texts': 6083,
 '08712460324': 103,
 'nat': 4218,
 'rate': 4985,
 'lmao': 3732,
 'fish': 2566,
 'memory': 3984,
 'need': 4243,
 'yahoo': 6841,
 'boys': 1324,
 'bring': 1354,
 'perf': 4592,
 'legal': 3644,
 'hiya': 3083,
 'comin': 1722,
 'bristol': 1358,
 'st': 5740,
 'week': 6637,
 'april': 940,
 'les': 3656,
 'got': 2862,
 'rudi': 5234,
 'yrs': 6882,
 'eve': 2383,
 'snoring': 5603,
 'drunk': 2226,
 'bak': 1092,
 'college': 1710,
 'work': 6769,
 'sends': 5374,
 'ink': 3289,
 'bath': 1123,
 'wamma': 6574,
 'laid': 3581,
 '

In [18]:
# Encode both splits with the vectoriser fitted above; the same `vec`
# object is used for train and test.

X_train_transformed, X_test_transformed = (
    vec.transform(split) for split in (X_train, X_test)
)

In [19]:
type(X_train_transformed)

scipy.sparse.csr.csr_matrix

In [28]:
# Fit a multinomial Naive Bayes classifier on the training counts, then
# score the held-out messages.

from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and fitting can be chained.
mnb = MultinomialNB().fit(X_train_transformed, y_train)

# Hard labels and per-class probabilities for the same test rows.
y_pred_class = mnb.predict(X_test_transformed)
y_pred_proba = mnb.predict_proba(X_test_transformed)

In [33]:
#measuring the accuracy of the predictions
# The original import list ended in a dangling `pre`, which is not a name in
# sklearn.metrics and made this cell raise ImportError. Only the two metrics
# used by the next cell are imported.
from sklearn.metrics import accuracy_score, confusion_matrix

In [46]:
# Compute both metrics first, then print them with a blank-line separator.
test_accuracy = accuracy_score(y_test, y_pred_class)
test_confusion = confusion_matrix(y_test, y_pred_class)

print('accuracy_score: {}'.format(test_accuracy))
print('\n')
print('confusion_matrix: {}'.format(test_confusion))


accuracy_score: 0.986244019138756


confusion_matrix: [[1434    8]
 [  15  215]]


In [54]:
y_pred_class

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [55]:
y_pred_proba

array([[9.94958999e-01, 5.04100100e-03],
       [9.99854067e-01, 1.45932813e-04],
       [9.07992380e-01, 9.20076203e-02],
       ...,
       [9.42539394e-01, 5.74606060e-02],
       [9.91184998e-01, 8.81500189e-03],
       [9.99990076e-01, 9.92429553e-06]])

In [58]:
#convert to pandas data frame
# Column labels come from the fitted model (`mnb.classes_`) instead of a
# hard-coded ['ham', 'spam'] list, so each probability column always carries
# the class it actually belongs to.
y_pred_proba = pd.DataFrame(y_pred_proba, columns=mnb.classes_)

In [61]:
#converting to 1 or 0
def probability_conv(x, threshold=0.5):
    """Binarise a probability against a cut-off.

    Parameters
    ----------
    x : float
        Probability to convert.
    threshold : float, default 0.5
        Cut-off value; the default keeps the previously hard-coded 0.5.

    Returns
    -------
    int
        0 if ``x`` is strictly below ``threshold``, otherwise 1.
    """
    if x < threshold:
        return 0
    return 1

In [69]:
# Replace each probability column with its 0/1 thresholded version.
for label in ('ham', 'spam'):
    y_pred_proba[label] = y_pred_proba[label].apply(probability_conv)

In [71]:

y_pred_proba.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [73]:
def convert_cols(x):
    """Translate a flag into a class name: 1 -> 'ham', anything else -> 'spam'."""
    return 'ham' if x == 1 else 'spam'

In [74]:
# Map the binarised 'ham' column back to class names. The function name was
# misspelled as `convert_colsert_cols`, which raised NameError.
y_pred_actual = y_pred_proba['ham'].apply(convert_cols)

In [75]:
y_pred_actual

0        ham
1        ham
2        ham
3        ham
4        ham
5        ham
6        ham
7        ham
8        ham
9        ham
10       ham
11       ham
12       ham
13       ham
14       ham
15       ham
16      spam
17       ham
18      spam
19       ham
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
1642    spam
1643     ham
1644     ham
1645     ham
1646     ham
1647     ham
1648     ham
1649    spam
1650     ham
1651     ham
1652     ham
1653     ham
1654    spam
1655     ham
1656     ham
1657     ham
1658     ham
1659     ham
1660    spam
1661     ham
1662     ham
1663     ham
1664    spam
1665     ham
1666     ham
1667    spam
1668     ham
1669     ham
1670     ham
1671     ham
Name: ham, Length: 1672, dtype: object

In [81]:
# Per-class precision, recall and F1 for the thresholded predictions.
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred_actual)
print(report)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1442
        spam       0.96      0.93      0.95       230

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



## Conclusion: MultinomialNB performs well on this dataset, reaching about 98.6% accuracy on the test set with an F1-score of 0.95 for the spam class.