## Setup

In [2]:
!pip install -qq boto3

## Imports

In [49]:
import boto3
import os
import zipfile
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Download data

In [11]:
env = os.environ["ENVIRONMENT"]
env

'dev'

In [17]:
s3c = boto3.client("s3",                               
                      aws_session_token=None,                              
                      config=boto3.session.Config(signature_version='s3v4'),
                      endpoint_url="http://192.168.48.3:9000",
                      verify=False 
                 )

In [18]:
s3c.list_objects_v2(Bucket=env)

{'ResponseMetadata': {'RequestId': '173D4205D50CE954',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '635',
   'content-security-policy': 'block-all-mixed-content',
   'content-type': 'application/xml',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-request-id': '173D4205D50CE954',
   'x-content-type-options': 'nosniff',
   'x-xss-protection': '1; mode=block',
   'date': 'Tue, 24 Jan 2023 13:33:49 GMT'},
  'RetryAttempts': 0},
 'IsTruncated': False,
 'Contents': [{'Key': 'data/raw/smsspamcollection.zip',
   'LastModified': datetime.datetime(2023, 1, 22, 14, 38, 30, 968000, tzinfo=tzlocal()),
   'ETag': '"ab53f9571d479ee677e7b283a06a661a"',
   'Size': 203415,
   'StorageClass': 'STANDARD',
   'Owner': {'DisplayName': 'minio',
    'ID': '02d6176db174dc93cb1b899f7c6078f08654445fe8cf1b6ce98d8855f66bdbf4'}}],
 'Name': 'dev',
 'Prefix': '',

In [20]:
s3c.download_file(Bucket=env, Key="data/raw/smsspamcollection.zip" , Filename="smsspamcollection.zip")

In [23]:
with zipfile.ZipFile("smsspamcollection.zip", "r") as zip_ref:
    zip_ref.extractall("dataset")

## Config

In [26]:
raw_data = "dataset/SMSSpamCollection"

## Load dataset

In [27]:
raw_df = pd.read_table(raw_data,
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

In [28]:
raw_df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
raw_df['label'] = raw_df["label"].map({'ham':0, 'spam':1})
print(raw_df.shape)
raw_df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## BoW

In [35]:
count_vector = CountVectorizer()

In [36]:
dummy_doc = ['Hello, how are you!',
                'Win money, win from home.',
                'Call me now.',
                'Hello, Call hello you tomorrow?']

In [37]:
count_vector.fit(dummy_doc)
count_vector.get_feature_names_out()

array(['are', 'call', 'from', 'hello', 'home', 'how', 'me', 'money',
       'now', 'tomorrow', 'win', 'you'], dtype=object)

In [38]:
doc_vec = count_vector.transform(dummy_doc).toarray()
doc_vec

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [39]:
frequency_matrix = pd.DataFrame(doc_vec, 
                                columns = count_vector.get_feature_names_out())
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


## Train-Test split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(raw_df['sms_message'], 
                                                    raw_df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(raw_df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


## Feature Processing

In [43]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

## Training

In [45]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

In [46]:
predictions = naive_bayes.predict(testing_data)

## Evaluation

In [50]:
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


## References

- https://olgabelitskaya.github.io/MLE_ND_PP0.html