In [89]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [90]:
# Eight toy sentences; labels[i] is the topic assigned to corpus[i].
corpus = [
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
    'I love green eggs, ham, sausages and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
]
labels = [
    'weather', 'weather', 'animals', 'food',
    'food', 'animals', 'weather', 'animals',
]

In [91]:
# Convert the list of documents into a NumPy array (rebinds `corpus`).
corpus = np.array(corpus)

In [92]:
# Two-column frame pairing each document with its category label.
corpusdf = pd.DataFrame({'Document' :corpus, 'Category':labels})

In [93]:
# Preview the first rows of the document/category frame.
corpusdf.head()

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food


In [94]:
# (rows, columns) of the frame: 8 documents x 2 columns in the output shown.
corpusdf.shape

(8, 2)

Text Processing
1) Removing tags, if any - like html etc
2) Removing accented characters - accents are sometimes used for emphasis and sometimes change the meaning of a word; they are usually converted to their plain ASCII equivalents (e.g. é -> e)
3) Expanding contractions - contractions such as don't (do not) and I'd (I would) are common in written English. Such words need to be expanded in order to create the right features from the words in a text document
4) Removing special characters - special characters such as @, # etc
5) Stemming : A word stem is the base form of a word, to which affixes are attached to create new words
   E.g. Watches, Watching, Watched are formed from the stem Watch.
   This process is known as inflection
   The reverse process of obtaining the base form of a word is known as stemming
6) Lemmatization : This is similar to stemming in that it also obtains the base form of a word.
   The base word or root word obtained through lemmatization is always a lexicographically correct
   word, whereas the root stem may not be.
7) Removing Stop words - These are words which may appear multiple times in documents but do not contribute towards feature building. There is no universal stop word list. Depending upon the problem space, we can create our own list. Words like a, an, the, and so on are typical stop words.

In [95]:
# Shared tokenizer instance; processText calls tokenizer.tokenize() on each cleaned document.
tokenizer = nltk.WordPunctTokenizer()

In [96]:
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (the logged output shows it was already up to date).
nltk.download('stopwords')
# English stop-word list that processText uses to filter tokens.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/amol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
def processText(doc, word_tokenizer=None, stopword_list=None):
    """Normalize one document into a space-separated string of content words.

    Steps: lower-case, strip surrounding whitespace, replace every character
    that is not an ASCII letter or whitespace with a space, tokenize, and drop
    stop words.

    Parameters
    ----------
    doc : str
        Raw document text.
    word_tokenizer : object with a ``tokenize(str)`` method, optional
        Defaults to the notebook-level ``tokenizer``.
    stopword_list : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if word_tokenizer is None:
        word_tokenizer = tokenizer
    if stopword_list is None:
        stopword_list = stop_words

    doc = doc.lower().strip()
    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing re.I|re.A positionally silently caps the
    # number of replacements instead of applying the flags.
    doc = re.sub(r'[^a-zA-Z\s]', ' ', doc, flags=re.I | re.A)
    tokens = word_tokenizer.tokenize(doc)
    kept_tokens = [token for token in tokens if token not in stopword_list]
    return ' '.join(kept_tokens)

In [98]:
# Wrap processText with np.vectorize so it can be applied to the whole corpus array in one call.
# NOTE(review): despite its name, `normal_corpus` is a callable, not a corpus.
normal_corpus = np.vectorize(processText)

In [99]:
# Normalize every document. The result is a NumPy array of cleaned strings, not a list
# (see the type check further down).
corplst = normal_corpus(corpus)

In [100]:
# Show the normalized documents.
corplst

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'king breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U50')

In [103]:
# Confirm the container type of the normalized corpus.
type(corplst)

numpy.ndarray

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

In [131]:
# Bag-of-words vectorizer created with default settings.
vtr = CountVectorizer()

In [132]:
# Fit the vectorizer on the normalized corpus and transform it in one step.
vtr_tras = vtr.fit_transform(corplst)

In [136]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `felst` a plain list.
felst = list(vtr.get_feature_names_out())

In [133]:
# Convert the transformed matrix to a dense array.
# NOTE(review): this rebinds `vtr_tras`, so re-running the cell on its own calls
# .toarray() on the already-converted value; a separate name would keep it idempotent.
vtr_tras = vtr_tras.toarray()

In [138]:
# Document-term count table: one row per document, one column per vocabulary term.
df2 = pd.DataFrame(vtr_tras, columns = felst)

In [139]:
# Show up to 10 rows (the corpus has only 8 documents, so this displays all of them).
df2.head(10)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,king,lazy,love,quick,sausages,sky,toast,today
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0
3,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0
5,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1
7,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0


In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [106]:
# TF-IDF vectorizer created with default settings.
vt = TfidfVectorizer()

In [107]:
# Fit the TF-IDF vectorizer on the normalized corpus and transform it in one step.
vt_trans = vt.fit_transform(corplst)

In [108]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `fealst` a plain list.
fealst = list(vt.get_feature_names_out())

In [111]:
# Inspect the return type of fit_transform (a SciPy CSR sparse matrix in the output shown).
type(vt_trans)

scipy.sparse.csr.csr_matrix

In [116]:
# Densify with toarray(), which yields a plain ndarray; todense() returns the
# discouraged np.matrix type. This also matches how the count matrices in this
# notebook are densified.
vt_arr = vt_trans.toarray()

In [118]:
# Make sure the dense TF-IDF values are held in a plain NumPy ndarray (rebinds `vt_arr`).
vt_arr = np.array(vt_arr)

In [121]:
# TF-IDF table: one row per document, one column per vocabulary term.
df1 = pd.DataFrame(vt_arr, columns=fealst)

In [122]:
# Display the full TF-IDF table (only 8 rows, so no head() is needed).
df1

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,king,lazy,love,quick,sausages,sky,toast,today
0,0.0,0.0,0.600978,0.526925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600978,0.0,0.0
1,0.0,0.0,0.493162,0.432394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571505,0.0,0.0,0.493162,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.380362,0.380362,0.0,0.380362,0.0,0.0,0.525949,0.0,0.380362,0.0,0.380362,0.0,0.0,0.0,0.0
3,0.321164,0.383215,0.0,0.0,0.383215,0.0,0.0,0.321164,0.0,0.0,0.321164,0.0,0.383215,0.0,0.0,0.0,0.321164,0.0,0.383215,0.0
4,0.394554,0.0,0.0,0.0,0.0,0.0,0.0,0.394554,0.0,0.470784,0.394554,0.0,0.0,0.0,0.394554,0.0,0.394554,0.0,0.0,0.0
5,0.0,0.0,0.0,0.365048,0.0,0.416351,0.416351,0.0,0.416351,0.0,0.0,0.0,0.0,0.416351,0.0,0.416351,0.0,0.0,0.0,0.0
6,0.0,0.0,0.360826,0.316365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.721652,0.0,0.498935
7,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.447214,0.0,0.0,0.0,0.0


In [72]:
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
# Second bag-of-words vectorizer; this one produces the features used for the classifier below.
cv = CountVectorizer()

In [127]:
# Learn the vocabulary from the normalized corpus.
# NOTE(review): this rebinds `vt`, which an earlier cell bound to the TfidfVectorizer;
# nothing visible below reads `vt`, so the assignment looks unnecessary.
vt = cv.fit(corplst)

In [129]:
# Encode the corpus with the fitted vocabulary; `vt_tran` is later passed to the classifier.
vt_tran = cv.transform(corplst)

In [130]:
# (documents, vocabulary terms): 8 x 20 in the output shown.
vt_tran.shape

(8, 20)

In [132]:
# Dense copy of the count matrix, used to build the DataFrame below.
vtarr = vt_tran.toarray()

In [134]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `features` a plain
# list of str, as before.
features = list(cv.get_feature_names_out())

In [135]:
# Show the vocabulary terms in column order.
features

['bacon',
 'beans',
 'beautiful',
 'blue',
 'breakfast',
 'brown',
 'dog',
 'eggs',
 'fox',
 'green',
 'ham',
 'jumps',
 'king',
 'lazy',
 'love',
 'quick',
 'sausages',
 'sky',
 'toast',
 'today']

In [138]:
# Count table: one row per document, one column per vocabulary term.
df = pd.DataFrame(vtarr,columns=features)

In [139]:
# Display the count table.
df

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,king,lazy,love,quick,sausages,sky,toast,today
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0
3,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0
5,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1
7,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0


In [140]:
# List the (term, column index) pairs of the fitted vocabulary.
list(cv.vocabulary_.items())

[('sky', 17),
 ('blue', 3),
 ('beautiful', 2),
 ('love', 14),
 ('quick', 15),
 ('brown', 5),
 ('fox', 8),
 ('jumps', 11),
 ('lazy', 13),
 ('dog', 6),
 ('king', 12),
 ('breakfast', 4),
 ('sausages', 16),
 ('ham', 10),
 ('bacon', 0),
 ('eggs', 7),
 ('toast', 18),
 ('beans', 1),
 ('green', 9),
 ('today', 19)]

In [141]:
# Attach the string category of each document.
# NOTE(review): this mutates `df` in place, so the frame displayed by earlier cells
# no longer matches the current state.
df['Class'] = labels

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,...,jumps,king,lazy,love,quick,sausages,sky,toast,today,Class
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,weather
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,weather
2,0,0,0,0,0,1,1,0,1,0,...,1,0,1,0,1,0,0,0,0,animals
3,1,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,food
4,1,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,food
5,0,0,0,1,0,1,1,0,1,0,...,0,0,1,0,1,0,0,0,0,animals
6,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,weather
7,0,0,0,0,0,1,1,0,1,0,...,0,0,1,0,1,0,0,0,0,animals


In [143]:
from sklearn import preprocessing

# Encoder that maps the string class names to integer codes.
en = preprocessing.LabelEncoder()

In [144]:
# Fit the encoder on the class names.
# NOTE(review): `enlabels` holds the return value of fit() here and is rebound to the
# encoded integer array two cells later, so one name covers two different kinds of object.
enlabels = en.fit(df['Class'])

In [146]:
# Inspect the learned classes; per the output, position i is the class encoded as integer i.
enlabels.classes_

array(['animals', 'food', 'weather'], dtype=object)

In [147]:
# Replace each class name with its integer code
# (animals=0, food=1, weather=2 according to the outputs shown).
enlabels = en.transform(df['Class'])

In [148]:
# Show the encoded labels.
enlabels

array([2, 2, 0, 1, 1, 0, 2, 0])

In [149]:
# Show the original string labels for comparison with the encoded ones.
labels

['weather',
 'weather',
 'animals',
 'food',
 'food',
 'animals',
 'weather',
 'animals']

In [150]:
# Store the integer-encoded class next to the string class (mutates `df` in place).
df['Labels'] = enlabels

In [151]:
# Display the count table together with the Class and Labels columns.
df

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,...,king,lazy,love,quick,sausages,sky,toast,today,Class,Labels
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,weather,2
1,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,weather,2
2,0,0,0,0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,0,animals,0
3,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,1,0,1,0,food,1
4,1,0,0,0,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,food,1
5,0,0,0,1,0,1,1,0,1,0,...,0,1,0,1,0,0,0,0,animals,0
6,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,2,0,1,weather,2
7,0,0,0,0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,0,animals,0


In [152]:
from sklearn.metrics import accuracy_score

In [153]:
from sklearn.naive_bayes import BernoulliNB

# Naive Bayes classifier with default settings.
# NOTE(review): BernoulliNB is designed for binary features while `vt_tran` holds counts;
# confirm that presence/absence modelling is intended rather than MultinomialNB.
model = BernoulliNB()


In [154]:
# Train on the count matrix `vt_tran` with the integer-encoded labels.
model.fit(vt_tran,enlabels)

BernoulliNB()

In [155]:
# Predict on the same matrix the model was fitted on (training-set predictions).
pred_train = model.predict(vt_tran)

In [157]:
# Accuracy on the training data itself (the rows used for fitting), not a held-out estimate.
print(accuracy_score(enlabels,pred_train))

1.0


In [158]:
# Show the predicted class codes.
pred_train

array([2, 2, 0, 1, 1, 0, 2, 0])

In [159]:
# Show the true class codes for side-by-side comparison with the predictions.
enlabels

array([2, 2, 0, 1, 1, 0, 2, 0])