In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

## Naive Bayes Classifier

### Model 1 using dummy dataset

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny',\
         'Rainy','Sunny','Overcast','Overcast','Rainy']
temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']
play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']

weather_processed = le.fit_transform(weather); print("weather_processed", weather_processed)
temp_processed = le.fit_transform(temp); print("temp_processed", temp_processed)
y_play_processed = le.fit_transform(play); print("play_processed", y_play_processed)

x_features = tuple(zip(weather_processed, temp_processed)); print("x_features", x_features)

weather_processed [2 2 0 1 1 1 0 2 2 1 2 0 0 1]
temp_processed [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
play_processed [0 0 1 1 1 0 1 0 1 1 1 1 1 0]
x_features ((2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2))


In [5]:
model1 = GaussianNB()
model1.fit(x_features, y_play_processed)

print(model1.predict([[2,1]])) # SUNNY & HOT
print("0 indicates that players can not 'play'")
print('')
print(model1.predict([[0,2]])) # OVERCAST & MILD
print("1 indicates that players can 'play'")

[0]
0 indicates that players can not 'play'

[1]
1 indicates that players can 'play'


### Naive Bayes with Multiple Labels

## Model 2

In [6]:
wine = datasets.load_wine()

In [7]:
wine_df = pd.DataFrame(np.column_stack((wine.data, wine.target)), columns=wine.feature_names +['target'])

In [8]:
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


In [11]:
x1_train, x1_test, y1_train, y1_test = train_test_split (wine_df.drop(labels='target', axis=1),
                                                     wine_df['target'], 
                                                     test_size=.33, 
                                                     random_state=1234)

In [12]:
x1_train.shape, x1_test.shape, y1_train.shape, y1_test.shape

((119, 13), (59, 13), (119,), (59,))

In [17]:
model2 = GaussianNB()

model2.fit(x1_train, y1_train)

y1_train_pred = model2.predict(x1_train)
y1_test_pred  = model2.predict(x1_test)

print(classification_report(y1_train, y1_train_pred))
print(classification_report(y1_test,  y1_test_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        41
         1.0       1.00      1.00      1.00        47
         2.0       1.00      1.00      1.00        31

    accuracy                           1.00       119
   macro avg       1.00      1.00      1.00       119
weighted avg       1.00      1.00      1.00       119

              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94        18
         1.0       0.95      0.88      0.91        24
         2.0       0.89      1.00      0.94        17

    accuracy                           0.93        59
   macro avg       0.93      0.94      0.93        59
weighted avg       0.93      0.93      0.93        59



In [41]:
y1_test_pred

array([1., 1., 1., 1., 2., 1., 2., 0., 0., 2., 2., 2., 0., 1., 1., 0., 0.,
       2., 2., 2., 0., 1., 1., 2., 1., 2., 1., 0., 0., 1., 0., 2., 0., 1.,
       1., 1., 0., 2., 0., 0., 2., 0., 1., 2., 1., 1., 0., 2., 1., 2., 0.,
       2., 2., 1., 0., 0., 2., 1., 1.])

## Model 3

In [18]:
news_data = datasets.fetch_20newsgroups(
    subset='all',
    categories=['rec.sport.baseball', 'talk.politics.misc']
    )

In [19]:
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [20]:
df = pd.DataFrame()
df['data'] = news_data['data']
df['target'] = news_data['target']
df.head()

Unnamed: 0,data,target
0,From: paula@koufax.cv.hp.com (Paul Andresen)\n...,0
1,From: garrett@Ingres.COM \nSubject: Re: Limiti...,1
2,From: djs9683@ritvax.isc.rit.edu\nSubject: Re:...,0
3,From: nickn@eskimo.com (Nick Nussbaum)\nSubjec...,1
4,From: jerry@sheldev.shel.isc-br.com (Gerald La...,0


In [21]:
x_train, x_test, y_train, y_test = train_test_split(df['data'], df['target'])

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1326,), (443,), (1326,), (443,))

In [22]:
count_vector = CountVectorizer()

In [23]:
x_train_processed = count_vector.fit_transform(x_train)

x_test_processed = count_vector.transform(x_test)

In [24]:
x_train_processed

<1326x22739 sparse matrix of type '<class 'numpy.int64'>'
	with 222743 stored elements in Compressed Sparse Row format>

In [25]:
count_vector.get_feature_names()[:8]

['00', '000', '000007', '0000ahc', '000th', '001', '001116', '001211']

In [26]:
pd.DataFrame(x_train_processed.toarray(), columns=count_vector.get_feature_names())

Unnamed: 0,00,000,000007,0000ahc,000th,001,001116,001211,001338,001815,...,zoo,zoologists,zorba,zot,zumwalt,zupcic,zz,zzzzzz,zzzzzzt,ñaustin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
nb = GaussianNB()

In [28]:
nb.fit(x_train_processed.toarray(), y_train)

GaussianNB()

In [29]:
y_train_pred = nb.predict(x_train_processed.toarray())

y_test_pred = nb.predict(x_test_processed.toarray())

In [30]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       744
           1       1.00      1.00      1.00       582

    accuracy                           1.00      1326
   macro avg       1.00      1.00      1.00      1326
weighted avg       1.00      1.00      1.00      1326



In [32]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       250
           1       0.95      0.97      0.96       193

    accuracy                           0.97       443
   macro avg       0.97      0.97      0.97       443
weighted avg       0.97      0.97      0.97       443



In [33]:
df_diff = pd.DataFrame()
df_diff['y_test'] = y_test
df_diff['y_pred'] = y_test_pred
df_diff[:5]

Unnamed: 0,y_test,y_pred
630,1,1
1507,0,0
1584,0,0
1016,0,0
578,1,1


In [34]:
index_list_of_wrong_preds = (df_diff.loc[df_diff['y_test'] != df_diff['y_pred']]).index
index_list_of_wrong_preds

Int64Index([1273, 1564, 1604, 651, 674, 763, 1024, 30, 473, 1738, 897, 792,
            824, 1254],
           dtype='int64')

In [35]:
df.iloc[index_list_of_wrong_preds]['data'].values[1]

"From: fath@mbcrr.dfci.harvard.edu (Michael Fath)\nSubject: HELP:  looking for Cleveland Sports Mailing List Info\nOrganization: Dana-Farber Cancer Institute\nLines: 14\nDistribution: world\nNNTP-Posting-Host: mbcrr.harvard.edu\n\nI'm looking for the address to join the Cleveland Sports Mailing List.\nIf anyone knows it, I would be greatful if they could email a copy of\nit to me.  If you are a member, just mail me one of the List's letters.\nI could probably figure it out from there.\n\nThanks!\n\n\n\n-- \nMM   MM FFFFF \tMichael J. Fath\t\t\nM M M M F\tDept of Microbiology and Molecular Genetics\nM  M  M FFF     Harvard Medical School         \nM     M F       Boston, MA 02115\t            fath@mbcrr.harvard.edu\n"

In [None]:
y_test.shape

In [None]:
y_test_pred.shape

In [None]:
ind = np.array(y_test) != y_test_pred

In [None]:
ind.shape

In [None]:
df.iloc[ind.index].data.values[1]