<a href="https://colab.research.google.com/github/bsong75/brendensong.github.io/blob/main/4.a.NLP_SMS_Logreg_MultiNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics


In [None]:
# Alternative (disabled): read the file into pandas from a local copy of sms.tsv in the working directory
#sms = pd.read_table('sms.tsv', header=None, names=['label', 'message'])

In [None]:
# Load the SMS dataset from the hosted TSV file. The file is tab-separated and
# has no header row, so the two column names are supplied explicitly.
url = 'https://raw.githubusercontent.com/justmarkham/pydata-dc-2016-tutorial/master/sms.tsv'
sms = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [None]:
# (rows, columns) of the loaded frame
sms.shape

(5572, 2)

In [None]:
# preview the first three rows
sms.head(3)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [None]:
# Class balance: number of messages carrying each label.
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
sms['label_num'] = sms['label'].map({'ham': 0, 'spam': 1})

In [None]:
# confirm the label_num column now sits next to label
sms.head(3)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1


In [None]:
# Features and target for CountVectorizer: X is the 1-D Series of raw message
# text, y is the 0/1 encoded label.
X = sms['message']
y = sms['label_num']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [None]:
# Split into training and test sets. No test_size is given, so scikit-learn's
# default split applies; random_state=1 fixes the shuffle so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)  # previously printed y_train.shape a second time

(4179,)
(1393,)
(4179,)
(4179,)


In [None]:
# 1. Instantiate the vectorizer with its default settings.
vect = CountVectorizer()

In [None]:
# 2. Learn the training-data vocabulary, then use it to create a document-term
#    matrix. fit_transform does both in one pass over X_train instead of
#    tokenizing the corpus twice with fit() followed by transform(); the fitted
#    vectorizer and the resulting matrix are the same.
X_train_dt = vect.fit_transform(X_train)

In [None]:
# sparse document-term matrix for the training messages (one row per message)
X_train_dt

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [None]:
# 3. Build the test document-term matrix using the vocabulary already learned
#    from the training data (transform only; the vectorizer is not refit).
X_test_dt = vect.transform(X_test)
X_test_dt

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [None]:
# 4. Build and evaluate a model: Multinomial Naive Bayes with default settings.
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [None]:
#5. TRAIN THE MODEL ON THE TRAINING DOCUMENT-TERM MATRIX (X_train_dt) AND ITS LABELS
nb.fit(X_train_dt, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Naive Bayes class predictions (0 = ham, 1 = spam) for the test messages.
y_pred_class = nb.predict(X_test_dt)

In [None]:
# Fraction of test messages whose predicted class matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9885139985642498

In [None]:
# Rows are the actual classes, columns the predicted classes (0 = ham, 1 = spam).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [None]:
# False positives: actual label 0 (ham) but predicted 1 (spam).
X_test.loc[y_test < y_pred_class]

In [None]:
# False negatives: actual label 1 (spam) but predicted 0 (ham).
X_test.loc[y_test > y_pred_class]

In [None]:
# Inspect a single test message by its index label (the row label carried over
# from the original frame, not a position within X_test).
X_test.loc[3132]

"LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323."

In [None]:
# Naive Bayes predicted probability of class 1 (spam) for each test message;
# column 1 of predict_proba corresponds to the second class label.
y_pred_probability = nb.predict_proba(X_test_dt)[:, 1]
y_pred_probability

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

## COMPARE WITH LOGISTIC REGRESSION

In [None]:
# import and instantiate a logistic regression model (default hyperparameters)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 77.8 ms, sys: 792 µs, total: 78.6 ms
Wall time: 81 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# class predictions from logistic regression for the test messages
# NOTE(review): this rebinds y_pred_class, which held the Naive Bayes predictions;
# any cell run after this one that reads y_pred_class sees the logistic-regression output.
y_pred_class = logreg.predict(X_test_dt)

In [None]:
# logistic-regression predicted probability of class 1 (spam) for each test message
y_pred_prob = logreg.predict_proba(X_test_dt)[:, 1]
y_pred_prob

array([0.00959377, 0.00295662, 0.00452424, ..., 0.031302  , 0.99748962,
       0.00119521])

In [None]:
# Accuracy of the logistic-regression class predictions on the test set.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9877961234745154

In [None]:
# Area under the ROC curve, computed from the predicted spam probabilities
# rather than the hard class predictions.
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.9936280651512441

In [None]:
# Confusion matrix for logistic regression: rows are actual classes, columns
# are predicted classes (0 = ham, 1 = spam).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1207,    1],
       [  16,  169]])