In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
uri = '/content/drive/My Drive/Colab Notebooks/York/ds1/data/SMSSpamCollection.txt'
df = pd.read_table(uri, sep='\t', header=None,names=['label', 'sms_message'])

# Output printing out first 5 rows
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label'] = df.label.map({'ham':0,'spam':1})
# Number of rows and column
df.shape

(5572, 2)

In [5]:
df

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [6]:
# Use from sklearn.model_selection import train_test_split to avoid seeing deprecation warning.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))



Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [None]:
y_test

1078    0
4028    0
958     0
4642    0
4674    0
       ..
3207    0
4655    0
1140    0
1793    1
1710    0
Name: label, Length: 1393, dtype: int64

In [8]:
# Data preprocessing with CountVectorizer()
# Import the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer

# create an instance of CountVectorizer called 'count_vector'. 
count_vector = CountVectorizer()

In [9]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data
training_data = count_vector.fit_transform(X_train)

# Transform testing data
testing_data = count_vector.transform(X_test)

In [13]:
str(training_data[0])

'  (0, 509)\t1\n  (0, 3181)\t1\n  (0, 5193)\t1\n  (0, 4781)\t1\n  (0, 3971)\t1\n  (0, 5479)\t1\n  (0, 3880)\t1\n  (0, 1572)\t1\n  (0, 4987)\t1\n  (0, 2864)\t2\n  (0, 3170)\t1\n  (0, 7424)\t1\n  (0, 4983)\t1\n  (0, 264)\t1\n  (0, 1552)\t1\n  (0, 4375)\t1\n  (0, 4743)\t1\n  (0, 50)\t1\n  (0, 6656)\t1\n  (0, 6892)\t1\n  (0, 4662)\t1\n  (0, 4779)\t1\n  (0, 2022)\t1'

In [14]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [17]:
y_train[0]

0

In [15]:
# Implementing Naive Bayes using scikit-learn
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Prediciting the messages as spam or ham -- testing data
predictions = naive_bayes.predict(testing_data)

In [None]:
# print(testing_data)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))
print('Precision score: {}'.format(precision_score(y_test, predictions)))
print('Recall score: {}'.format(recall_score(y_test, predictions)))
print('F1 score: {}'.format(f1_score(y_test, predictions)))

Accuracy score: 0.9885139985642498
Precision score: 0.9720670391061452
Recall score: 0.9405405405405406
F1 score: 0.9560439560439562


In [23]:
# testing_data = count_vector.transform(["Best price"])
# predictions = naive_bayes.predict(testing_data)
# print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))

sample = ["Best price", "Free ticket for you", "india best 800", "Black lives matter", "Free entry in 2 a for you "]
testing_data = count_vector.transform(sample)
predictions = naive_bayes.predict(testing_data)
predictions

array([0, 0, 1, 0, 1])