# **CountVectorizer**

In [0]:
import pandas as pd
import math
import numpy as np

In [0]:
messages = ["Hey hey hey lets go get lunch today :)",
           "Did you go home?",
           "Hey!!! I need a favor"]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# lowercase=False keeps the original casing, so "Hey" and "hey" are counted
# as two separate features in the vocabulary.
vect = CountVectorizer(lowercase=False)

In [4]:
vect.fit(messages)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [5]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# releases; .tolist() keeps the displayed value a plain list of str.
(vect.get_feature_names_out().tolist()
 if hasattr(vect, "get_feature_names_out")
 else vect.get_feature_names())

['Did',
 'Hey',
 'favor',
 'get',
 'go',
 'hey',
 'home',
 'lets',
 'lunch',
 'need',
 'today',
 'you']

In [0]:
dtm = vect.transform(messages)

In [0]:
# NOTE: this list adds the token "i_phone" to the third message, but `dtm` was
# already computed from the previous list and `vect` has not been refit yet,
# so the next table has no "i_phone" column until fit_transform is called again.
messages = ["Hey hey hey lets go get lunch today :)",
           "Did you go home?",
           "Hey!!! I need a favor i_phone"]

In [8]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
pd.DataFrame(
    dtm.toarray(),
    columns=(vect.get_feature_names_out()
             if hasattr(vect, "get_feature_names_out")
             else vect.get_feature_names()),
)

Unnamed: 0,Did,Hey,favor,get,go,hey,home,lets,lunch,need,today,you
0,0,1,0,1,1,2,0,1,1,0,1,0
1,1,0,0,0,1,0,1,0,0,0,0,1
2,0,1,1,0,0,0,0,0,0,1,0,0


In [9]:
# Transform an unseen message with the already-fitted vocabulary: words that
# were not seen during fit ("drink", "tonight") get no column.
new_message = ['Hey lets go get a drink tonight']
new_dtm = vect.transform(new_message)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
pd.DataFrame(
    new_dtm.toarray(),
    columns=(vect.get_feature_names_out()
             if hasattr(vect, "get_feature_names_out")
             else vect.get_feature_names()),
)

Unnamed: 0,Did,Hey,favor,get,go,hey,home,lets,lunch,need,today,you
0,0,1,0,1,1,0,0,1,0,0,0,0


In [10]:
messages.append(new_message[0])
messages

['Hey hey hey lets go get lunch today :)',
 'Did you go home?',
 'Hey!!! I need a favor i_phone',
 'Hey lets go get a drink tonight']

In [11]:
# Refit on the extended message list so the vocabulary picks up the new tokens.
dtm = vect.fit_transform(messages)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
pd.DataFrame(
    dtm.toarray(),
    columns=(vect.get_feature_names_out()
             if hasattr(vect, "get_feature_names_out")
             else vect.get_feature_names()),
)

Unnamed: 0,Did,Hey,drink,favor,get,go,hey,home,i_phone,lets,lunch,need,today,tonight,you
0,0,1,0,0,1,1,2,0,0,1,1,0,1,0,0
1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1
2,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0
3,0,1,1,0,1,1,0,0,0,1,0,0,0,1,0


# **TfidfVectorizer**

**tf(t,d) = count of t in d / number of words in d**

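<b>N = total number of documents  
df(t) = number of documents that contain t  
idf(t) = N/df(t) (raw ratio)  
idf(t) = log(N/(df(t) + 1)) (log-scaled, with +1 smoothing to avoid division by zero)
 </b>

Note: `TfidfVectorizer` uses a slightly different default, idf(t) = ln((1 + N)/(1 + df(t))) + 1, and L2-normalises each row, so its numbers will not match these textbook formulas exactly.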

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
def createDTM(messages):
    """Build a TF-IDF document-term matrix for ``messages`` as a DataFrame.

    Parameters
    ----------
    messages : iterable of str
        Raw documents. A fresh ``TfidfVectorizer`` with default settings is
        fit on them, so the vocabulary depends only on this call's input.

    Returns
    -------
    pandas.DataFrame
        One row per message and one column per vocabulary term.
    """
    vect = TfidfVectorizer()
    dtm = vect.fit_transform(messages)  # sparse document-term matrix

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only when the replacement is not available.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Densify the sparse matrix so it can be displayed as a labelled table.
    return pd.DataFrame(dtm.toarray(), columns=feature_names)

In [13]:
# Baseline pair: "hey" appears once in both messages; every other term is
# unique to a single message.
messages = ["Hey lets get lunch :)",
           "Hey!!! I need a favor"]
createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.534046,0.379978,0.534046,0.534046,0.0
1,0.631667,0.0,0.449436,0.0,0.0,0.631667


In [14]:
# Same pair, but "hey" is now repeated three times in the first message
# (higher term frequency there); the second message is unchanged.
messages = ["Hey hey hey lets get lunch :)",
           "Hey!!! I need a favor"]
createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.363788,0.776515,0.363788,0.363788,0.0
1,0.631667,0.0,0.449436,0.0,0.0,0.631667


In [15]:
# "hey" is dropped from the second message, so it now occurs in only one of
# the two documents.
messages = ["Hey hey hey lets get lunch :)",
           "I need a favor"]
createDTM(messages)

Unnamed: 0,favor,get,hey,lets,lunch,need
0,0.0,0.288675,0.866025,0.288675,0.288675,0.0
1,0.707107,0.0,0.0,0.0,0.0,0.707107


In [0]:
docA = "The cat cat sat on my face"
docB = "The dog sat on my bed"

In [0]:
bowA = docA.split(" ")
bowB = docB.split(" ")

In [18]:
bowA

['The', 'cat', 'cat', 'sat', 'on', 'my', 'face']

In [0]:
wordSet = set(bowA).union(set(bowB))

In [20]:
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [0]:
wordDictA = dict.fromkeys(wordSet, 0) 
wordDictB = dict.fromkeys(wordSet, 0)

In [22]:
wordDictA

{'The': 0, 'bed': 0, 'cat': 0, 'dog': 0, 'face': 0, 'my': 0, 'on': 0, 'sat': 0}

In [0]:
# Rebuild the counts from the token lists instead of incrementing in place:
# `+= 1` on dictionaries shared across cells doubles every count whenever this
# cell is re-run. Iterating the existing keys keeps the same vocabulary and
# key order.
wordDictA = {word: bowA.count(word) for word in wordDictA}
wordDictB = {word: bowB.count(word) for word in wordDictB}

In [24]:
wordDictA

{'The': 1, 'bed': 0, 'cat': 2, 'dog': 0, 'face': 1, 'my': 1, 'on': 1, 'sat': 1}

In [25]:
wordDictB

{'The': 1, 'bed': 1, 'cat': 0, 'dog': 1, 'face': 0, 'my': 1, 'on': 1, 'sat': 1}

In [26]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,1,0,2,0,1,1,1,1
1,1,1,0,1,0,1,1,1


In [0]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word in one document.

    Parameters
    ----------
    wordDict : dict
        ``{word: count}`` for the document.
    bow : sequence
        The document's tokens; only its length is used, as the denominator.

    Returns
    -------
    dict
        ``{word: count / len(bow)}``. When ``bow`` is empty every frequency
        is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [0]:
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [0]:
# NOTE: rebinding docA here has no effect on bowA, wordDictA or tfBowA, which
# were already built from the earlier "The cat cat sat on my face" sentence.
docA = "The cat sat on my face"

In [30]:
tfBowA

{'The': 0.14285714285714285,
 'bed': 0.0,
 'cat': 0.2857142857142857,
 'dog': 0.0,
 'face': 0.14285714285714285,
 'my': 0.14285714285714285,
 'on': 0.14285714285714285,
 'sat': 0.14285714285714285}

In [0]:
def computeIDF(docList):
    """Compute an inverse document frequency for every word in a corpus.

    Parameters
    ----------
    docList : list of dict
        One ``{word: count}`` dictionary per document.

    Returns
    -------
    dict
        ``{word: log(N / df + 1)}`` where ``N = len(docList)`` and ``df`` is
        the number of documents in which the word's count is positive.
        Words that occur in no document (``df == 0``) get ``0.0``.

    Notes
    -----
    The document count, the document-frequency table and the final idf table
    are printed as a side effect.

    The vocabulary is the union of the keys of all documents, so dictionaries
    with differing key sets no longer raise ``KeyError``, and an empty
    ``docList`` returns ``{}`` instead of raising ``IndexError``.
    """
    N = len(docList)
    print ("Total Number of Documents - ", N)

    # Document frequency: in how many documents does each word occur?
    idfDict = {}
    for doc in docList:
        for word, val in doc.items():
            idfDict.setdefault(word, 0)
            if val > 0:
                idfDict[word] += 1
    print (idfDict)

    for word, val in idfDict.items():
        if val > 0:
            # Mind the precedence: this evaluates log((N / df) + 1), not
            # log(N / (df + 1)). The argument is > 1, so the result is
            # already positive and abs() cannot change it here.
            idfDict[word] = abs(np.log(float(N) / val + 1))
        else:
            # The word appears in no document; avoid dividing by zero.
            idfDict[word] = 0.0
    print (idfDict)
    return idfDict

In [0]:
# NOTE: these strings are not re-tokenised; the computeIDF call that follows
# is fed the existing wordDictA / wordDictB count dictionaries.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

In [33]:
idfs = computeIDF([wordDictA, wordDictB])

Total Number of Documents -  2
{'cat': 1, 'sat': 2, 'face': 1, 'on': 2, 'my': 2, 'The': 2, 'bed': 1, 'dog': 1}
{'cat': 1.0986122886681098, 'sat': 0.6931471805599453, 'face': 1.0986122886681098, 'on': 0.6931471805599453, 'my': 0.6931471805599453, 'The': 0.6931471805599453, 'bed': 1.0986122886681098, 'dog': 1.0986122886681098}


In [0]:
def computeTFIDF(tfBow, idfs):
    """Weight each term frequency by its inverse document frequency.

    Parameters
    ----------
    tfBow : dict
        ``{word: tf}`` for a single document.
    idfs : dict
        ``{word: idf}``; it is indexed with every key of ``tfBow``.

    Returns
    -------
    dict
        ``{word: tf * idf}`` with the same keys as ``tfBow``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [0]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [36]:
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,0.099021,0.0,0.313889,0.0,0.156945,0.099021,0.099021,0.099021
1,0.115525,0.183102,0.0,0.183102,0.0,0.115525,0.115525,0.115525


In [0]:
msg = ["The cat cat sat on my face", 
       "The dog sat on my bed"]

In [38]:
createDTM(msg)

Unnamed: 0,bed,cat,dog,face,my,on,sat,the
0,0.0,0.754584,0.0,0.377292,0.268446,0.268446,0.268446,0.268446
1,0.498446,0.0,0.498446,0.0,0.354649,0.354649,0.354649,0.354649
