In [1]:
# Supervised learning = trained using labeled data.

In [2]:
# ** Supervised learning is used in apps where historical data accurately predicts future data **

In [3]:
# Accuracy = num of correct predictions / total predictions
# Recall = true positives / (true positives + false negatives)
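# Precision = true positives / (true positives + false positives)
# F1 score = 2 * (precision * recall) / (precision + recall)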

In [4]:
# False positive = type 1 error
# False negative = type 2 error (often more critical - e.g. the test says you do NOT have the disease when you actually do)

In [14]:
# Sklearn
import numpy as np
import pandas as pd

In [15]:
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv(
    '../TextFiles/smsspamcollection.tsv',
    sep='\t',
)

In [16]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [17]:
# Element-wise mask of missing values (True where a cell is null).
df.isna()

Unnamed: 0,label,message,length,punct
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
5567,False,False,False,False
5568,False,False,False,False
5569,False,False,False,False
5570,False,False,False,False


In [20]:
# Per-column count of missing values; all zeros means the data is complete.
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [21]:
# Number of rows (messages) in the dataset.
df.shape[0]

5572

In [22]:
# Show the target column.
df.loc[:, 'label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [23]:
# Distinct label values, in order of first appearance.
df['label'].drop_duplicates().to_numpy()

array(['ham', 'spam'], dtype=object)

In [24]:
# Class balance: number of messages per label, most frequent first.
df['label'].value_counts(sort=True)

ham     4825
spam     747
Name: label, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split

In [57]:
# Model inputs are the `length` and `punct` columns; the target is `label`.
features = df[['length', 'punct']]
labels = df['label']

# train_test_split returns the splits grouped per input array:
#   (features_train, features_test, labels_train, labels_test)
# Unpack in exactly that order, into the names the modelling and evaluation
# cells below use (train_feat / test_feat / train_lbl / test_lbl).
# test_size=0.3 holds out 30% of the rows; random_state=42 makes the split
# reproducible.
(
    train_feat,
    test_feat,
    train_lbl,
    test_lbl,
) = train_test_split(features, labels, test_size=0.3, random_state=42)

In [34]:
# (rows, columns) of the training feature matrix.
train_feat.shape

(3900, 2)

In [35]:
# (rows, columns) of the held-out test feature matrix.
test_feat.shape

(1672, 2)

In [36]:
from sklearn.linear_model import LogisticRegression

In [38]:
# Logistic-regression classifier; the solver is pinned explicitly to 'lbfgs'.
lr_model = LogisticRegression(solver='lbfgs')

In [39]:
# Train the logistic-regression model on the training split.
lr_model.fit(X=train_feat, y=train_lbl)

In [40]:
from sklearn import metrics

In [41]:
# Predict labels for the held-out test features.
predictions = lr_model.predict(X=test_feat)

In [42]:
# Display the labels predicted by the logistic-regression model.
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [43]:
# Display the true labels of the test split, for comparison with the predictions.
test_lbl

3245     ham
944      ham
1044     ham
2484     ham
812      ham
        ... 
2505     ham
2525    spam
4975     ham
650     spam
4463     ham
Name: label, Length: 1672, dtype: object

In [44]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
metrics.confusion_matrix(y_true=test_lbl, y_pred=predictions)

array([[1404,   44],
       [ 219,    5]])

In [46]:
# Classification report (precision / recall / f1 / support per class).
# The report is a multi-line string, so print it; leaving it as the cell's
# value would render the raw repr with literal '\n' escapes.
print(metrics.classification_report(test_lbl, predictions))

'              precision    recall  f1-score   support\n\n         ham       0.87      0.97      0.91      1448\n        spam       0.10      0.02      0.04       224\n\n    accuracy                           0.84      1672\n   macro avg       0.48      0.50      0.48      1672\nweighted avg       0.76      0.84      0.80      1672\n'

In [47]:
# Fraction of test messages whose predicted label matches the true label.
metrics.accuracy_score(y_true=test_lbl, y_pred=predictions)

0.8427033492822966

In [49]:
from sklearn.naive_bayes import MultinomialNB

In [50]:
# Multinomial naive Bayes classifier with default settings.
nb_model = MultinomialNB()

In [51]:
# Train the naive Bayes model on the same training split.
nb_model.fit(X=train_feat, y=train_lbl)

In [52]:
# Predict labels for the held-out test features with the naive Bayes model.
preds = nb_model.predict(X=test_feat)

In [53]:
# Display the labels predicted by the naive Bayes model.
preds

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [55]:
# Per-class precision / recall / f1 for the naive Bayes predictions.
print(metrics.classification_report(y_true=test_lbl, y_pred=preds))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672

