# Text Vectorization: Bag of Words, TF-IDF, and Word2Vec

In [1]:
# Toy corpus: four short sentences used as the input documents for the
# CountVectorizer and TfidfVectorizer cells.
msg = [
    "Shaw likes to play cricket",
    "Mary likes to play tennis",
    "John likes to play volleyball or cricket",
    "Heena likes to play tennis or throwball",
]


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Bag of words: one row per sentence in `msg`, one column per vocabulary term,
# each cell holding how many times that term occurs in the sentence.
vec = CountVectorizer()
bag = vec.fit_transform(msg).toarray()  # dense array so it can be tabulated directly
vocabulary = vec.get_feature_names_out()
print(vocabulary)
df = pd.DataFrame(data=bag, columns=vocabulary)
df

['cricket' 'heena' 'john' 'likes' 'mary' 'or' 'play' 'shaw' 'tennis'
 'throwball' 'to' 'volleyball']


Unnamed: 0,cricket,heena,john,likes,mary,or,play,shaw,tennis,throwball,to,volleyball
0,1,0,0,1,0,0,1,1,0,0,1,0
1,0,0,0,1,1,0,1,0,1,0,1,0
2,1,0,1,1,0,1,1,0,0,0,1,1
3,0,1,0,1,0,1,1,0,1,1,1,0


## TF-IDF

In [4]:
# TF-IDF weighting of the same sentences: one row per sentence in `msg`,
# one column per vocabulary term.
# NOTE: this cell rebinds `vec`, `bag` and `df`.
vec = TfidfVectorizer()
bag = vec.fit_transform(msg)  # left as returned; densified only when building the table
terms = vec.get_feature_names_out()
print(terms)
df = pd.DataFrame(data=bag.toarray(), columns=terms)
df

['cricket' 'heena' 'john' 'likes' 'mary' 'or' 'play' 'shaw' 'tennis'
 'throwball' 'to' 'volleyball']


Unnamed: 0,cricket,heena,john,likes,mary,or,play,shaw,tennis,throwball,to,volleyball
0,0.504879,0.0,0.0,0.334174,0.0,0.0,0.334174,0.640375,0.0,0.0,0.334174,0.0
1,0.0,0.0,0.0,0.334174,0.640375,0.0,0.334174,0.0,0.504879,0.0,0.334174,0.0
2,0.391275,0.0,0.496283,0.258981,0.0,0.391275,0.258981,0.0,0.0,0.0,0.258981,0.496283
3,0.0,0.496283,0.0,0.258981,0.0,0.391275,0.258981,0.0,0.391275,0.496283,0.258981,0.0


## Word2Vec

In [5]:
import nltk

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
# Read the corpus into `text` as a list of lines (trailing newlines are kept).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (plain ASCII decodes identically) - confirm.
with open('Resources/word2vec.txt', encoding='utf-8') as f:
    text = f.readlines()
text[0]

'With Bag of Words and TF-IDF text vectorization techniques we did not get semantic meaning of words But for most of the applications of NLP tasks like sentiment classification, sarcasm detection etc require semantic meaning of a word and semantic relationships of a word with other words.\n'

In [8]:
import re
# Replace every character that is not an ASCII letter with a single space,
# updating the existing `text` list in place. Digits, punctuation and the
# trailing newline of each line all become spaces.
text[:] = [re.sub(r'[^a-zA-Z]', " ", line) for line in text]
print("Number of sentences: ", len(text))
print("First Sentence: ", text[0])
        

Number of sentences:  11
First Sentence:  With Bag of Words and TF IDF text vectorization techniques we did not get semantic meaning of words But for most of the applications of NLP tasks like sentiment classification  sarcasm detection etc require semantic meaning of a word and semantic relationships of a word with other words  


In [10]:
# Tokenize every line of `text`; `tokens` is a list with one token list per line.
tokens = [word_tokenize(line) for line in text]
tokens[0]

['With',
 'Bag',
 'of',
 'Words',
 'and',
 'TF',
 'IDF',
 'text',
 'vectorization',
 'techniques',
 'we',
 'did',
 'not',
 'get',
 'semantic',
 'meaning',
 'of',
 'words',
 'But',
 'for',
 'most',
 'of',
 'the',
 'applications',
 'of',
 'NLP',
 'tasks',
 'like',
 'sentiment',
 'classification',
 'sarcasm',
 'detection',
 'etc',
 'require',
 'semantic',
 'meaning',
 'of',
 'a',
 'word',
 'and',
 'semantic',
 'relationships',
 'of',
 'a',
 'word',
 'with',
 'other',
 'words']

In [13]:
from gensim.models import Word2Vec

# Train a small Word2Vec model on the tokenized lines.
#   sg=0          -> CBOW training (gensim's default, spelled out to match the variable name)
#   min_count=2   -> drop words that occur fewer than 2 times
#   vector_size=2 -> 2-dimensional embeddings
#   seed=1        -> gensim's default seed, made explicit
#   workers=1     -> single training thread; multi-threaded training makes the
#                    resulting vectors vary from run to run
# For run-to-run identical vectors across interpreter launches, gensim also
# requires PYTHONHASHSEED to be fixed in the environment.
cbow = Word2Vec(
    sentences=tokens,
    sg=0,
    min_count=2,
    vector_size=2,
    seed=1,
    workers=1,
)
word = cbow.wv.index_to_key  # words retained in the model's vocabulary
word

['of',
 'and',
 'words',
 'a',
 'vector',
 'the',
 'word',
 'space',
 'Word',
 'in',
 'semantic',
 'context',
 'method',
 'gram',
 'to',
 'vec',
 'skip',
 'as',
 'are',
 'we',
 'meaning',
 'for',
 'most',
 'center',
 'be',
 'input',
 'relationships',
 'with',
 'other',
 'corpus',
 'embeddings',
 'well',
 'size',
 'is']

In [14]:
# Report the vocabulary size, then display the embedding learned for 'and'.
print("Total Words: ", len(word))
cbow.wv['and']

Total Words:  34


array([0.255022, 0.450747], dtype=float32)