In [5]:
import pandas as pd

docs = pd.read_excel('SMSSpamCollection.xls',header=None,names=['Class', 'SMS']) 
docs = pd.read_table('SMSSpamCollection', header=None, names=['Class', 'sms'])

docs.head()


# counting spam and ham (non spam) instances
ham_spam=docs.Class.value_counts()
ham_spam

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# mapping labels to 0 and 1
docs['label'] = docs.Class.map({'ham':0, 'spam':1})

In [92]:
docs.head()

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [97]:
X = docs.sms
y = docs.label
print(X.shape)
print(y.shape)


(5572,)
(5572,)


In [98]:
# splitting into test and train

from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.20, random_state=42)

In [99]:
X_train.head()

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: sms, dtype: object

In [100]:
print(X_train.shape)

(4457,)


In [101]:
print(y_train.shape)

(4457,)


In [102]:
print(X_test.shape)

(1115,)


In [103]:
print(y_test.shape)

(1115,)


In [None]:
# vectorizing the sentences; removing stop words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')
vect = CountVectorizer()

In [71]:
vect.fit(X_train)
vect.vocabulary_

{'reply': 5688,
 'to': 6889,
 'win': 7475,
 '100': 258,
 'weekly': 7397,
 'where': 7438,
 'will': 7472,
 'the': 6774,
 '2006': 354,
 'fifa': 2806,
 'world': 7556,
 'cup': 2107,
 'be': 1272,
 'held': 3365,
 'send': 5981,
 'stop': 6461,
 '87239': 695,
 'end': 2569,
 'service': 6000,
 'hello': 3370,
 'sort': 6305,
 'of': 4855,
 'out': 4977,
 'in': 3604,
 'town': 6960,
 'already': 925,
 'that': 6771,
 'so': 6253,
 'dont': 2396,
 'rush': 5826,
 'home': 3442,
 'am': 935,
 'eating': 2505,
 'nachos': 4651,
 'let': 4058,
 'you': 7663,
 'know': 3927,
 'eta': 2628,
 'how': 3488,
 'come': 1903,
 'guoyang': 3259,
 'go': 3141,
 'tell': 6712,
 'her': 3385,
 'then': 6785,
 'told': 6905,
 'hey': 3393,
 'sathya': 5882,
 'till': 6857,
 'now': 4809,
 'we': 7373,
 'dint': 2324,
 'meet': 4393,
 'not': 4798,
 'even': 2639,
 'single': 6148,
 'time': 6858,
 'can': 1630,
 'saw': 5895,
 'situation': 6165,
 'orange': 4946,
 'brings': 1510,
 'ringtones': 5764,
 'from': 2994,
 'all': 912,
 'chart': 1734,
 'heroes':

In [72]:
# removing the stop words
vect = CountVectorizer(stop_words='english')


In [73]:
vect.fit(X)
X_train_dtm = vect.transform(X_train)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [104]:
# printing feature names
print(vect.get_feature_names())
print(len(vect.get_feature_names()))

8444


In [77]:
# transforming the train and test datasets
X_train_transformed = vect.transform(X_train)
X_test_transformed =vect.transform(X_test)

In [78]:
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 298)	1
  (0, 404)	1
  (0, 798)	1
  (0, 2341)	1
  (0, 2853)	1
  (0, 3116)	1
  (0, 3729)	1
  (0, 6260)	1
  (0, 6584)	1
  (0, 6606)	1
  (0, 7112)	1
  (0, 8133)	1
  (0, 8203)	1
  (0, 8285)	1
  (1, 2666)	1
  (1, 2785)	1
  (1, 2925)	1
  (1, 3734)	1
  (1, 3813)	1
  (1, 4345)	1
  (1, 4484)	1
  (1, 5121)	1
  (1, 6420)	1
  (1, 6925)	1
  (1, 7653)	1
  :	:
  (4452, 2783)	1
  (4452, 3515)	1
  (4452, 3761)	1
  (4452, 4537)	1
  (4452, 8246)	1
  (4452, 8263)	1
  (4453, 4422)	1
  (4453, 4566)	1
  (4453, 6924)	1
  (4454, 3310)	1
  (4454, 3716)	1
  (4454, 5854)	1
  (4454, 6142)	1
  (4454, 6930)	1
  (4454, 6931)	1
  (4455, 4216)	1
  (4455, 5302)	1
  (4455, 6603)	1
  (4455, 7598)	1
  (4456, 2546)	1
  (4456, 4250)	1
  (4456, 5132)	1
  (4456, 6496)	1
  (4456, 6894)	1
  (4456, 7349)	1


In [117]:
# training the NB model and making predictions
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
mnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
#y_pred_proba =mnb.predict_proba(X_test_transformed)

#print(y_pred_proba)
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


0.9829596412556054

In [116]:
y_pred_class

array([0, 0, 0, ..., 0, 0, 0])

In [108]:
# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[953,  13],
       [  6, 143]])

In [109]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
#[row, column]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
TP = confusion[1, 1]

[[953  13]
 [  6 143]]


In [110]:
sensitivity = TP / float(FN + TP)
print("sensitivity",sensitivity)

sensitivity 0.959731543624161


In [111]:
specificity = TN / float(TN + FP)

print("specificity",specificity)

specificity 0.9865424430641822


In [112]:
precision = TP / float(TP + FP)

print("precision",precision)
print(metrics.precision_score(y_test, y_pred_class))

precision 0.9166666666666666
0.9166666666666666


In [113]:
print("precision",precision)
print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, y_pred_class))

precision 0.9166666666666666
PRECISION SCORE : 0.9166666666666666
RECALL SCORE : 0.959731543624161
F1 SCORE : 0.9377049180327869
