### Import

In [None]:
import numpy as np
import pandas as pd
import sklearn

import warnings
# NOTE: 'ignore' suppresses every warning raised from this point on, including
# deprecation notices from pandas / scikit-learn; comment it out while debugging.
warnings.simplefilter('ignore')

### Load Data / Prepare Data

In [None]:
# Toy corpus: each entry pairs a one-line document with its topic name.
_corpus = [
    ('NSUT is a great educational institution.', 'education'),
    ('Educational greatness depends on ethics', 'education'),
    ('A story of great ethics and educational greatness', 'education'),
    ('Sholey is a great cinema', 'cinema'),
    ('good movie depends on good story', 'cinema'),
    ('very good educational institute', 'education'),
]

# Split the pairs into the two parallel lists used by the rest of the notebook.
docs = [text for text, _ in _corpus]
label = [topic for _, topic in _corpus]

In [None]:
# build a two-column frame: the document text and the name of its class
df = pd.DataFrame({'docs': docs, 'class': label})

In [None]:
# display the data created (only six rows, so the whole frame is shown)
df

Unnamed: 0,docs,class
0,NSUT is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema
5,very good educational institute,education


In [None]:
# Train-Test Split: the first five rows are used for training, the last row is
# held out for testing.
# .copy() makes both frames independent of df, so adding columns to them later
# neither touches df nor triggers pandas' SettingWithCopyWarning.
train, test = df.iloc[:5, :].copy(), df.iloc[5:, :].copy()

#### Let's transform the data so that we can apply a Bayesian classifier

In [None]:
# convert the labels to integers. As there are only two classes we use 0 and 1
# assign() builds a new frame instead of setting a column in place on a frame
# that was sliced out of df, which avoids pandas' SettingWithCopyWarning.
train = train.assign(label=train['class'].map({'education': 0, 'cinema': 1}))
train

Unnamed: 0,docs,class,label
0,NSUT is a great educational institution.,education,0
1,Educational greatness depends on ethics,education,0
2,A story of great ethics and educational greatness,education,0
3,Sholey is a great cinema,cinema,1
4,good movie depends on good story,cinema,1


In [None]:
# the text and the numeric label are all the model needs; remove the 'class' column
train = train.drop(columns=['class'])
train

Unnamed: 0,docs,label
0,NSUT is a great educational institution.,0
1,Educational greatness depends on ethics,0
2,A story of great ethics and educational greatness,0
3,Sholey is a great cinema,1
4,good movie depends on good story,1


#### Let's create the dictionary

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# help(CountVectorizer)

In [None]:
# fit() returns the fitted vectorizer, so construction and fitting are chained;
# vocabulary_ maps each learned term to its column index
vec = CountVectorizer().fit(train['docs'])
vec.vocabulary_

{'nsut': 11,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 13,
 'ethics': 4,
 'story': 15,
 'of': 12,
 'and': 0,
 'sholey': 14,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() replaces it and returns a NumPy array; tolist() turns
# it back into a plain list of str so the printed form stays the same.
vocab = vec.get_feature_names_out().tolist()
print('dictionary created is: ', vocab)
print('Length of the dictionary is: ', len(vocab))

dictionary created is:  ['and', 'cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'is', 'movie', 'nsut', 'of', 'on', 'sholey', 'story']
Length of the dictionary is:  16


#### Remove Stop Words as they do not contribute to the classification method

In [None]:
# create the CountVectorizer object again but set the argument stop_words to 'english'.
# The stop words will be removed when you fit the documents
vec2 = CountVectorizer(stop_words = 'english')
vec2.fit(train['docs'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a NumPy array, so tolist() is used to print a plain list of str.
vocab2 = vec2.get_feature_names_out().tolist()
print('dictionary created is: ', vocab2)
print('Length of the dictionary is: ', len(vocab2))
vec2.vocabulary_

dictionary created is:  ['cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'movie', 'nsut', 'sholey', 'story']
Length of the dictionary is:  12


{'nsut': 9,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 11,
 'sholey': 10,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [None]:
# Convert the train docs to vector form: one row per document and one column per
# term of vec2's vocabulary, returned as a sparse matrix of term counts
X_vectors = vec2.transform(train['docs'])
X_vectors

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [None]:
# The columns of the dense array below follow the order of this feature list.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it, and tolist() keeps the output a plain list of str.
print(vec2.get_feature_names_out().tolist())
X_vectors.toarray()

['cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'movie', 'nsut', 'sholey', 'story']


array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1]], dtype=int64)

In [None]:
# printing the sparse matrix lists only its non-zero entries as "(row, column)  count"
print(X_vectors)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 11)	1
  (3, 0)	1
  (3, 5)	1
  (3, 10)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 11)	1


In [None]:
# transform the test set as well; start by looking at the single held-out row
test

Unnamed: 0,docs,class
5,very good educational institute,education


In [None]:
# first convert the class labels to integers, then drop the class column as we
# did for train. drop() returns a new frame, so its result has to be bound back
# to `test` for the column to actually disappear; assign() likewise avoids
# setting a column in place on a frame sliced out of df.
test = (
    test
    .assign(label=test['class'].map({'education': 0, 'cinema': 1}))
    .drop(columns='class')
)

# now transform as per our dictionary
test_vectors = vec2.transform(test['docs'])
print(test_vectors.toarray())

[[0 0 1 0 1 0 0 0 0 0 0 0]]


### Fit the Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# multinomial Naive Bayes with default settings, trained on the term-count
# matrix and the integer labels of the training documents
mnb = MultinomialNB()

mnb.fit(X=X_vectors, y=train['label'])



MultinomialNB()

### Test the Model

In [None]:
# class probabilities for the test document; the columns follow mnb.classes_,
# i.e. [P(label 0 = education), P(label 1 = cinema)]
mnb.predict_proba(test_vectors)

array([[0.56140351, 0.43859649]])

In [None]:
# predict() returns one label per row of test_vectors; there is a single test
# document, so show just its label (0 = education, 1 = cinema)
prediction = mnb.predict(test_vectors)
prediction[0]

0

# Load sklearn's 20 newsgroups text dataset and build a Naive Bayes classifier

In [None]:
# code for loading the dataset

from sklearn.datasets import fetch_20newsgroups

# The cells below read newsgroups_train / newsgroups_test, so both must be
# loaded here or the notebook fails with NameError on a fresh kernel.
# remove=(...) strips headers, footers and quoted replies from each post.
# NOTE: the first call downloads the dataset and caches it locally.
newsgroups_train = fetch_20newsgroups(subset = 'train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset = 'test', remove=('headers', 'footers', 'quotes'))

#### The real data lies in the 'filenames' and 'target' attributes. The target attribute is the integer index of the category:

In [None]:
# the shape of `filenames` is used here as the size of each split
# (presumably one entry per document — confirm against the scikit-learn docs)
print('shape of train dataset: ',newsgroups_train.filenames.shape)
print('shape of test dataset',newsgroups_test.filenames.shape)

In [None]:
# to access target variable use
# (target holds integer indices into target_names, the list of category names)
print('target data : ',newsgroups_train.target)
print('shape of target data', newsgroups_train.target.shape)
# fixed the misspelled name `newgroups_train`, which raised NameError
print('Number of categories: ', len(newsgroups_train.target_names))
print('Name of categories: ', newsgroups_train.target_names)

In [None]:
# The document collection is reached through newsgroups_train.data;
# indexing it with 0 displays the first training document.
newsgroups_train.data[0]

## Create the dictionary

## Vectorize the train and test set

## Fit the model

## Make Prediction

## Evaluate the model

In [None]:
from sklearn import metrics

# `test_pred` is expected to hold the model's predictions for the vectorized
# newsgroups test set, produced in the "Make Prediction" section above.
# Both metrics use the same prediction variable and the correctly spelled
# `newsgroups_test`; each value is printed so neither result is silently lost.
accuracy = metrics.accuracy_score(newsgroups_test.target, test_pred)
macro_f1 = metrics.f1_score(newsgroups_test.target, test_pred, average='macro')
print('accuracy: ', accuracy)
print('macro F1: ', macro_f1)