In [1]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv", sep='\t', header=None, names=['label', 'message'])
print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
df.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['label'].apply(lambda x: 1 if x =='spam' else 0)

In [9]:
df.shape

(5572, 3)

In [5]:
df.head()

Unnamed: 0,label,message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.message, df.spam, test_size=0.2)

In [10]:
X_train.shape

(4457,)

In [11]:
X_test.shape

(1115,)

In [12]:
type(X_train)

pandas.core.series.Series

In [13]:
X_train[:4]

4206    IMPORTANT INFORMATION 4 ORANGE USER 0796XXXXXX...
4981                               So what u doing today?
870     What do U want for Xmas? How about 100 free te...
3478    I got it before the new year cos yetunde said ...
Name: message, dtype: object

In [14]:
type(y_train)

pandas.core.series.Series

In [15]:
y_train[:4]

4206    1
4981    0
870     1
3478    0
Name: spam, dtype: int64

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59365 stored elements and shape (4457, 7712)>

In [17]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7712,))

In [18]:
X_train_cv.shape

(4457, 7712)

In [20]:
v.get_feature_names_out()[1000:1050]

array(['antibiotic', 'any', 'anybody', 'anyhow', 'anymore', 'anyone',
       'anyones', 'anyplaces', 'anythin', 'anything', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apes', 'apo',
       'apologetic', 'apologise', 'apology', 'app', 'apparently',
       'appeal', 'appendix', 'applebees', 'apples', 'application',
       'apply', 'applying', 'appointment', 'appreciate', 'appreciated',
       'approaches', 'approaching', 'approve', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar',
       'arab', 'arabian', 'arcade'], dtype=object)

In [21]:
dir (v)

['_CountVectorizer__metadata_request__fit',
 '_CountVectorizer__metadata_request__transform',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params_html',
 '_html_repr',
 '_limit_features',
 '_parameter_constraints'

In [22]:
v.vocabulary_

{'important': 3601,
 'information': 3644,
 'orange': 4936,
 'user': 7188,
 '0796xxxxxx': 42,
 'today': 6906,
 'is': 3713,
 'ur': 7167,
 'lucky': 4206,
 'day': 2179,
 'find': 2856,
 'out': 4967,
 'why': 7467,
 'log': 4128,
 'onto': 4908,
 'http': 3507,
 'www': 7608,
 'urawinner': 7168,
 'com': 1898,
 'there': 6796,
 'fantastic': 2774,
 'prizeawaiting': 5379,
 'you': 7672,
 'so': 6244,
 'what': 7439,
 'doing': 2395,
 'do': 2373,
 'want': 7342,
 'for': 2935,
 'xmas': 7618,
 'how': 3494,
 'about': 766,
 '100': 256,
 'free': 2974,
 'text': 6752,
 'messages': 4405,
 'new': 4711,
 'video': 7250,
 'phone': 5134,
 'with': 7513,
 'half': 3294,
 'price': 5360,
 'line': 4082,
 'rental': 5658,
 'call': 1607,
 'now': 4800,
 'on': 4895,
 '0800': 45,
 '0721072': 25,
 'to': 6899,
 'more': 4532,
 'got': 3194,
 'it': 3725,
 'before': 1297,
 'the': 6779,
 'year': 7648,
 'cos': 2015,
 'yetunde': 7662,
 'said': 5836,
 'she': 6035,
 'wanted': 7344,
 'surprise': 6594,
 'but': 1577,
 'when': 7444,
 'didnt': 23

In [41]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7712,))

In [43]:
import numpy as np

In [44]:
np.where(X_train_np[0]!=0)

(array([  42, 1898, 2179, 2774, 2856, 3507, 3601, 3644, 3713, 4128, 4206,
        4908, 4936, 4967, 5379, 6796, 6906, 7167, 7168, 7188, 7467, 7608,
        7672]),)

In [46]:
X_train[:4][4206]

"IMPORTANT INFORMATION 4 ORANGE USER 0796XXXXXX. TODAY IS UR LUCKY DAY!2 FIND OUT WHY LOG ONTO http://www.urawinner.com THERE'S A FANTASTIC PRIZEAWAITING YOU!"

In [49]:
X_train_np[0][2179]

np.int64(1)

In [50]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [51]:
X_test_cv = v.transform(X_test)

In [52]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       971
           1       0.99      0.92      0.96       144

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [53]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [55]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [58]:
clf.fit(X_train, y_train)

0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [59]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       971
           1       0.99      0.92      0.96       144

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

