## Using Pandas to create a DataFrame

In [3]:
# Colab-only setup: mount Google Drive so the dataset stored there can be read.
# Outside Colab the `google.colab` package is not installed, so the mount is
# skipped instead of aborting a "Run All" with ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab; nothing to mount
else:
    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
# Colab-only: run this cell only after Google Drive is mounted at /content/gdrive.
# %cd switches the kernel's working directory to the project folder on Drive so
# the relative path 'SMSSpamCollection' used when loading the data resolves;
# %ls lists the folder to confirm the dataset file is present.
%cd /content/gdrive/MyDrive/GitHub/python-notebook/Natural\ Language\ Processing/SpamClassifier/
%ls

/content/gdrive/MyDrive/GitHub/python-notebook/Natural Language Processing/SpamClassifier
SMSSpamCollection  SpamClassifier.ipynb


In [6]:
import pandas as pd

# Read the tab-separated SMS corpus from the working directory. `names`
# supplies the two column labels: the class tag and the raw message text.
# NOTE(review): the UCI description of this corpus lists 5,574 messages, while
# this load yields 5,572 rows; default quote handling may be merging lines --
# confirm, and consider quoting=csv.QUOTE_NONE if so.
df = pd.read_csv(
    'SMSSpamCollection',
    sep="\t",
    names=['label', 'sms_message'],
)

# Show the first five rows as a sanity check.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# Series.map turns any value missing from the mapping into NaN, so re-running
# this cell on already-encoded labels would wipe the column. The dtype guard
# makes the cell idempotent: it only maps while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

## Training and testing sets
- `X_train`: Training data for the 'sms_message' column.
- `y_train`: Training data for the 'label' column.
- `X_test`: Testing data for the 'sms_message' column.
- `y_test`: Testing data for the 'label' column.

In [8]:
# Hold out part of the data for evaluation. No test_size is passed, so the
# split proportions are scikit-learn's defaults; random_state=1 fixes the
# shuffle so the same rows land in each set on every run.
# Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

# Report the size of the full dataset and of each partition.
print(f'Number of rows in the total set: {df.shape[0]}')
print(f'Number of rows in the training set: {X_train.shape[0]}')
print(f'Number of rows in the test set: {X_test.shape[0]}')

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


### Applying a bag-of-words model to process our dataset
This lets us feed the SMS messages to our ML algorithm by converting the text into numerical data: each message becomes a vector of word counts.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Create a CountVectorizer with its default settings
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and return their count matrix
training_data = count_vector.fit_transform(X_train)

# Encode the test messages with the vocabulary learned from the training set;
# the vectorizer is not re-fitted, so the test set does not influence the features
testing_data = count_vector.transform(X_test)

## Naive Bayes implementation using scikit-learn

In [11]:
from sklearn.naive_bayes import MultinomialNB

In [12]:
# Train a multinomial Naive Bayes classifier, created with default
# hyperparameters, on the word-count matrix and the training labels.
naive_bayes = MultinomialNB()
# fit() is the last expression, so the fitted estimator's repr is the cell output.
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Predict a label (0 = ham, 1 = spam) for every message in the held-out test matrix
predictions = naive_bayes.predict(testing_data)

## Evaluation
**Accuracy**:

`correct_predictions/total_predictions`

**Precision**: 

`[True Positives/(True Positives + False Positives)]`

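**Recall (sensitivity)**:

`[True Positives/(True Positives + False Negatives)]`

**F1 score** (harmonic mean of precision and recall):

`[2 * Precision * Recall/(Precision + Recall)]`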

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [15]:
# Compare the predicted labels with the true test labels; every metric is
# called as metric(y_true, y_pred).
# NOTE(review): precision/recall/F1 presumably treat label 1 (spam) as the
# positive class via sklearn's default pos_label -- confirm.
print(f'Accuracy score: {accuracy_score(y_test, predictions)}')
print(f'Precision score: {precision_score(y_test, predictions)}')
print(f'Recall score: {recall_score(y_test, predictions)}')
print(f'F1 score: {f1_score(y_test, predictions)}')

Accuracy score: 0.9885139985642498
Precision score: 0.9720670391061452
Recall score: 0.9405405405405406
F1 score: 0.9560439560439562
