In [1]:
import pandas as pd

# Toy training corpus: each row is a (sentence, class label) pair, where the
# label is either 'stmt' (statement) or 'question'.
columns = ['sent', 'class']

rows = [
    ['This is my book', 'stmt'],
    ['They are novels', 'stmt'],
    ['have you read this book', 'question'],
    ['who is the author', 'question'],
    ['what are the characters', 'question'],
    ['This is how I bought the book', 'stmt'],
    ['I like fictions', 'stmt'],
    ['what is your favorite book', 'question'],
]

# Build the frame once from the complete list of records.
training_data = pd.DataFrame(rows, columns=columns)
training_data

Unnamed: 0,sent,class
0,This is my book,stmt
1,They are novels,stmt
2,have you read this book,question
3,who is the author,question
4,what are the characters,question
5,This is how I bought the book,stmt
6,I like fictions,stmt
7,what is your favorite book,question


Count vector for statement class

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the sentences labelled 'stmt' (boolean-mask selection instead of
# iterating row by row).
stmt_docs = training_data.loc[training_data['class'] == 'stmt', 'sent'].tolist()

# Fit a vocabulary on the stmt sentences alone and count each word per sentence.
vec_s = CountVectorizer()
X_s = vec_s.fit_transform(stmt_docs)

# Term-document matrix: one row per stmt sentence, one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
tdm_s = pd.DataFrame(X_s.toarray(), columns=vec_s.get_feature_names_out())

tdm_s

Unnamed: 0,are,book,bought,fictions,how,is,like,my,novels,the,they,this
0,0,1,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,0,0,0,0,1,0,1,0
2,0,1,1,0,1,1,0,0,0,1,0,1
3,0,0,0,1,0,0,1,0,0,0,0,0


Count vector for Question Class

In [3]:
# Keep only the sentences labelled 'question' (boolean-mask selection instead
# of iterating row by row).
q_docs = training_data.loc[training_data['class'] == 'question', 'sent'].tolist()

# Fit a vocabulary on the question sentences alone and count each word per sentence.
vec_q = CountVectorizer()
X_q = vec_q.fit_transform(q_docs)

# Term-document matrix: one row per question sentence, one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
tdm_q = pd.DataFrame(X_q.toarray(), columns=vec_q.get_feature_names_out())

tdm_q

Unnamed: 0,are,author,book,characters,favorite,have,is,read,the,this,what,who,you,your
0,0,0,1,0,0,1,0,1,0,1,0,0,1,0
1,0,1,0,0,0,0,1,0,1,0,0,1,0,0
2,1,0,0,1,0,0,0,0,1,0,1,0,0,0
3,0,0,1,0,1,0,1,0,0,0,1,0,0,1


Feature Counts for stmt class (12)

In [4]:
# Vocabulary of the stmt class, as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
word_list_s = vec_s.get_feature_names_out().tolist()

# Column sums of the term-document matrix: total occurrences of each word
# across all stmt sentences.
count_list_s = X_s.toarray().sum(axis=0)

# word -> total count in the stmt class
freq_s = dict(zip(word_list_s, count_list_s))
freq_s

{'are': 1,
 'book': 2,
 'bought': 1,
 'fictions': 1,
 'how': 1,
 'is': 2,
 'like': 1,
 'my': 1,
 'novels': 1,
 'the': 1,
 'they': 1,
 'this': 2}

Feature Counts for Question Class (14)

In [5]:

# Vocabulary of the question class, as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
word_list_q = vec_q.get_feature_names_out().tolist()

# Column sums of the term-document matrix: total occurrences of each word
# across all question sentences.
count_list_q = X_q.toarray().sum(axis=0)

# word -> total count in the question class
freq_q = dict(zip(word_list_q, count_list_q))
freq_q

{'are': 1,
 'author': 1,
 'book': 2,
 'characters': 1,
 'favorite': 1,
 'have': 1,
 'is': 2,
 'read': 1,
 'the': 2,
 'this': 1,
 'what': 2,
 'who': 1,
 'you': 1,
 'your': 1}

Total Count of Features in the Training Set

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Every training sentence, regardless of class.
docs = training_data['sent'].tolist()

# Fit one vocabulary over the whole training set; its size is the number of
# distinct features used as the vocabulary size in the smoothing step.
vec = CountVectorizer()
X = vec.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; fetch the names once via
# get_feature_names_out() and reuse them for both the count and the display.
feature_names = vec.get_feature_names_out().tolist()
total_features = len(feature_names)
print ('total_features:', total_features)
feature_names

total_features: 21


['are',
 'author',
 'book',
 'bought',
 'characters',
 'favorite',
 'fictions',
 'have',
 'how',
 'is',
 'like',
 'my',
 'novels',
 'read',
 'the',
 'they',
 'this',
 'what',
 'who',
 'you',
 'your']

Total Word Count per Class

In [13]:

# Total number of word occurrences in each class: the per-word count arrays
# are one-dimensional, so a plain sum collapses them to a single number.
total_cnts_features_s = count_list_s.sum()
total_cnts_features_q = count_list_q.sum()

print('total_cnts_features_s：', total_cnts_features_s)
print('total_cnts_features_q：：', total_cnts_features_q)

total_cnts_features_s： 15
total_cnts_features_q：： 18


In [17]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Unseen sentence to classify.
new_sentence = 'what is the price of the book'

# The per-class vocabularies are lowercase ('This' was stored as 'this' by
# CountVectorizer), so lowercase the query tokens as well; otherwise a
# capitalised word would never match its training count.
new_word_list = [token.lower() for token in word_tokenize(new_sentence)]
new_word_list

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['what', 'is', 'the', 'price', 'of', 'the', 'book']

Laplace Smoothing Stmt Class

In [18]:

# Add-one (Laplace) smoothed likelihood of every query word under the stmt class:
#   P(word | stmt) = (count of word in stmt + 1)
#                    / (total word count in stmt + vocabulary size of the training set)
smoothing_denominator_s = total_cnts_features_s + total_features

prob_s_with_ls = []
for word in new_word_list:
    # A word never seen in stmt sentences contributes a raw count of 0.
    count = freq_s.get(word, 0)
    print ('word:', word, "count:", count)
    prob_s_with_ls.append((count + 1) / smoothing_denominator_s)

# Repeated query words collapse to one key here (their probabilities are identical).
dict(zip(new_word_list, prob_s_with_ls))

word: what count: 0
word: is count: 2
word: the count: 1
word: price count: 0
word: of count: 0
word: the count: 1
word: book count: 2


{'book': 0.08333333333333333,
 'is': 0.08333333333333333,
 'of': 0.027777777777777776,
 'price': 0.027777777777777776,
 'the': 0.05555555555555555,
 'what': 0.027777777777777776}

Laplace Smoothing Question Class

In [14]:
# Add-one (Laplace) smoothed likelihood of every query word under the question class:
#   P(word | question) = (count of word in question + 1)
#                        / (total word count in question + vocabulary size of the training set)
smoothing_denominator_q = total_cnts_features_q + total_features

prob_q_with_ls = []
for word in new_word_list:
    # A word never seen in question sentences contributes a raw count of 0.
    count = freq_q.get(word, 0)
    prob_q_with_ls.append((count + 1) / smoothing_denominator_q)

# Repeated query words collapse to one key here (their probabilities are identical).
dict(zip(new_word_list, prob_q_with_ls))

{'book': 0.07692307692307693,
 'is': 0.07692307692307693,
 'of': 0.02564102564102564,
 'price': 0.02564102564102564,
 'the': 0.07692307692307693,
 'what': 0.07692307692307693}

Conditional Probability Given stmt without Smoothing

In [28]:
# Unsmoothed estimate for the stmt class:
#   P(word | stmt) = count of word in stmt / total word count in stmt
# A word absent from the stmt vocabulary has no entry here (probability 0).
t_count = count_list_s.sum()
print ('total count:', t_count)

prob_s = [count / t_count for count in count_list_s]
dict(zip(word_list_s, prob_s))

total count: 15


{'are': 0.06666666666666667,
 'book': 0.13333333333333333,
 'bought': 0.06666666666666667,
 'fictions': 0.06666666666666667,
 'how': 0.06666666666666667,
 'is': 0.13333333333333333,
 'like': 0.06666666666666667,
 'my': 0.06666666666666667,
 'novels': 0.06666666666666667,
 'the': 0.06666666666666667,
 'they': 0.06666666666666667,
 'this': 0.13333333333333333}

Conditional Probability Given question without Smoothing

In [27]:
# Unsmoothed estimate for the question class:
#   P(word | question) = count of word in question / total word count in question
# A word absent from the question vocabulary has no entry here (probability 0).
t_count = count_list_q.sum()
print ('total count:', t_count)

prob_q = [count / t_count for count in count_list_q]
dict(zip(word_list_q, prob_q))


total count: 18


{'are': 0.05555555555555555,
 'author': 0.05555555555555555,
 'book': 0.1111111111111111,
 'characters': 0.05555555555555555,
 'favorite': 0.05555555555555555,
 'have': 0.05555555555555555,
 'is': 0.1111111111111111,
 'read': 0.05555555555555555,
 'the': 0.1111111111111111,
 'this': 0.05555555555555555,
 'what': 0.1111111111111111,
 'who': 0.05555555555555555,
 'you': 0.05555555555555555,
 'your': 0.05555555555555555}