In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset loaded later in this notebook is read from
# /content/sample_data, which is outside this mount point -- confirm whether
# the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

In [4]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
# NOTE(review): pandas' default quote handling treats '"' as a quote character;
# if any message contains an unbalanced double quote, adjacent lines can be
# merged into one row. Compare df.shape[0] with the raw line count of the file
# and consider quoting=csv.QUOTE_NONE if they differ.
df = pd.read_csv(
    '/content/sample_data/SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)

# Preview the first five rows
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the class labels as integers: 'ham' -> 0 and 'spam' -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# the identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run: a second
# execution leaves the already-encoded column unchanged instead of wiping it.
LABEL_CODES = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
label_codes = df['label'].map(LABEL_CODES)

# Fail fast on an unexpected label rather than letting NaN reach the model.
assert label_codes.notna().all(), 'unexpected value in the label column'
df['label'] = label_codes

# Number of rows and columns
df.shape

(5572, 2)

In [8]:
# Import train_test_split from sklearn.model_selection to avoid a deprecation warning.
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation. No test_size is passed, so the
# library default is used; random_state pins the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'], df['label'], random_state=1
)

# Report how many rows ended up in each partition.
row_counts = (
    ('Number of rows in the total set: {}', df.shape[0]),
    ('Number of rows in the training set: {}', X_train.shape[0]),
    ('Number of rows in the test set: {}', X_test.shape[0]),
)
for template, n_rows in row_counts:
    print(template.format(n_rows))



Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [37]:
# Inspect the held-out labels produced by the split (0 = ham, 1 = spam).
y_test

1078    0
4028    0
958     0
4642    0
4674    0
       ..
3207    0
4655    0
1140    0
1793    1
1710    0
Name: label, Length: 1393, dtype: int64

In [10]:
# Data preprocessing with CountVectorizer()
# Only the import lives in this cell. The vectorizer instance `count_vector`
# is created in the next cell, right where it is fitted, so it is not
# instantiated twice.
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Build a fresh bag-of-words vectorizer for this run.
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and encode them, then encode
# the test messages with that same vocabulary. Tuple elements are evaluated
# left to right, so the fit always happens before the test set is transformed.
training_data, testing_data = (
    count_vector.fit_transform(X_train),
    count_vector.transform(X_test),
)

In [12]:
# Train a multinomial Naive Bayes classifier on the vectorized training set.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line echoes the fitted model as the cell output.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Predict ham (0) or spam (1) for every message in the vectorized test set
predictions = naive_bayes.predict(testing_data)

In [35]:
# print(testing_data)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the test-set predictions against the true labels. Each entry pairs the
# line to print with the metric function that fills it in.
metric_reports = (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
)
for template, metric in metric_reports:
    print(template.format(metric(y_test, predictions)))

Accuracy score: 0.9885139985642498
Precision score: 0.9720670391061452
Recall score: 0.9405405405405406
F1 score: 0.9560439560439562


In [58]:
# Spot-check the trained classifier on a few hand-written messages.
# The results are stored under their own names so that `testing_data` and
# `predictions` (the held-out test-set results used by the metrics cell) are
# not overwritten; otherwise re-running the metrics cell would compare y_test
# against these four sample predictions.
sample = ["Best price", "Free ticket for you", "india best 800", "Black lives meter"]
sample_data = count_vector.transform(sample)
sample_predictions = naive_bayes.predict(sample_data)
sample_predictions

array([0, 0, 1, 0])