# https://github.com/avishek-018/CLBLP-2023/

## TF-IDF and Count Vectorizer

Objective:

Apply scikit-learn's `CountVectorizer` (bag of words) and `TfidfVectorizer` (TF-IDF) to a small set of Bangla sentences.

In [None]:
import warnings
# Silence every warning category for the rest of the session so library
# warnings do not clutter the cell outputs.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
!pip install scikit-learn==0.21.2

In [None]:
import sklearn
# Show which scikit-learn version the kernel actually imported.
print(sklearn.__version__)

1.0.2


In [None]:
%%time

import pandas as pd

CPU times: user 2.33 s, sys: 265 ms, total: 2.6 s
Wall time: 3.07 s


In [None]:
!pip install bnlp_toolkit # installing for Bangla toolkit

In [None]:
!wget -O Emotion.csv https://www.dropbox.com/s/pjzwlxbmm7sq4sf/emotion.csv?dl=0

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [None]:
# Toy corpus: three short Bangla sentences that are fed to both vectorizers
# below. The first two differ in a single word (বাংলায় vs বাংলার), and the
# third shares only a couple of words with them.
sentences = [
    "আমি বাংলায় গান গাই",
    "আমি বাংলার গান গাই",
    "আমি আমার আমিকে চিরদিন এই বাংলায় খুঁজে পাই"
]

### Bag of Words

In [None]:
# Bag-of-words model: tokens are produced by splitting each sentence on
# whitespace; every other CountVectorizer option is left at its default.
cv = CountVectorizer(tokenizer=lambda text: text.split())
cv_corpus = cv.fit_transform(sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() converts the returned array into plain Python strings so the
# printed list looks the same on every version.
if hasattr(cv, "get_feature_names_out"):
    cv_terms = cv.get_feature_names_out().tolist()
else:
    cv_terms = cv.get_feature_names()

print(cv_terms)
print(cv.vocabulary_)

['আমার', 'আমি', 'আমিকে', 'এই', 'খুঁজে', 'গাই', 'গান', 'চিরদিন', 'পাই', 'বাংলার', 'বাংলায়']
{'আমি': 1, 'বাংলায়': 10, 'গান': 6, 'গাই': 5, 'বাংলার': 9, 'আমার': 0, 'আমিকে': 2, 'চিরদিন': 7, 'এই': 3, 'খুঁজে': 4, 'পাই': 8}




In [None]:
# Expand the sparse count matrix into a dense array (one row per sentence,
# one column per vocabulary term) and print it.
dense_counts = cv_corpus.toarray()
print(dense_counts)

[[0 1 0 0 0 1 1 0 0 0 1]
 [0 1 0 0 0 1 1 0 0 1 0]
 [1 1 1 1 1 0 0 1 1 0 1]]


In [None]:
# Sum down the rows: total number of occurrences of each vocabulary term
# across all sentences.
term_totals = cv_corpus.toarray().sum(axis=0)
print(term_totals)

[1 3 1 1 1 2 2 1 1 1 2]


In [None]:
# Convert the sparse count matrix to a DataFrame.
# The matrix is re-derived from the already fitted vectorizer instead of being
# read from `cv_corpus`: this cell rebinds `cv_corpus` from a sparse matrix to
# a DataFrame, so reading the old value would make a second run of the cell
# fail. transform() on the fitted vectorizer yields the same counts.
cv_corpus = pd.DataFrame.sparse.from_spmatrix(cv.transform(sentences))
cv_corpus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,1,0,0,0,1,1,0,0,0,1
1,0,1,0,0,0,1,1,0,0,1,0
2,1,1,1,1,1,0,0,1,1,0,1


In [None]:
# Invert the vocabulary (term -> column index) into column index -> term.
col_map = {index: term for term, index in cv.vocabulary_.items()}
# Rename all columns in a single, non-mutating call. Keys that are not present
# among the columns are ignored by rename(), so re-running this cell after the
# columns already carry term names is a harmless no-op instead of a KeyError.
cv_corpus = cv_corpus.rename(columns=col_map)
cv_corpus

Unnamed: 0,আমার,আমি,আমিকে,এই,খুঁজে,গাই,গান,চিরদিন,পাই,বাংলার,বাংলায়
0,0,1,0,0,0,1,1,0,0,0,1
1,0,1,0,0,0,1,1,0,0,1,0
2,1,1,1,1,1,0,0,1,1,0,1


### TF-IDF

In [None]:
# TF-IDF model using the same whitespace tokenisation as the bag-of-words
# example; every other TfidfVectorizer option is left at its default.
tf_idf = TfidfVectorizer(tokenizer=lambda text: text.split())

tf_corpus = tf_idf.fit_transform(sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() converts the returned array into plain Python strings so the
# printed list looks the same on every version.
if hasattr(tf_idf, "get_feature_names_out"):
    tf_terms = tf_idf.get_feature_names_out().tolist()
else:
    tf_terms = tf_idf.get_feature_names()

print(tf_terms)
print(tf_idf.vocabulary_)

['আমার', 'আমি', 'আমিকে', 'এই', 'খুঁজে', 'গাই', 'গান', 'চিরদিন', 'পাই', 'বাংলার', 'বাংলায়']
{'আমি': 1, 'বাংলায়': 10, 'গান': 6, 'গাই': 5, 'বাংলার': 9, 'আমার': 0, 'আমিকে': 2, 'চিরদিন': 7, 'এই': 3, 'খুঁজে': 4, 'পাই': 8}




In [None]:
# Expand the sparse TF-IDF matrix into a dense array (one row per sentence,
# one column per vocabulary term) and print it.
dense_tfidf = tf_corpus.toarray()
print(dense_tfidf)

[[0.         0.40912286 0.         0.         0.         0.52682017
  0.52682017 0.         0.         0.         0.52682017]
 [0.         0.37311881 0.         0.         0.         0.4804584
  0.4804584  0.         0.         0.63174505 0.        ]
 [0.37994462 0.22440141 0.37994462 0.37994462 0.37994462 0.
  0.         0.37994462 0.37994462 0.         0.28895767]]


In [None]:
# Invert the vocabulary (term -> column index) into column index -> term.
col_map = {index: term for term, index in tf_idf.vocabulary_.items()}
# Convert the sparse TF-IDF matrix to a DataFrame and label its columns with
# the terms in one chained, non-mutating expression.
# The matrix is re-derived from the already fitted vectorizer instead of being
# read from `tf_corpus`: this cell rebinds `tf_corpus` from a sparse matrix to
# a DataFrame, so reading the old value would make a second run of the cell
# fail. transform() on the fitted vectorizer yields the same weights.
tf_corpus = (
    pd.DataFrame.sparse.from_spmatrix(tf_idf.transform(sentences))
    .rename(columns=col_map)
)
tf_corpus

Unnamed: 0,আমার,আমি,আমিকে,এই,খুঁজে,গাই,গান,চিরদিন,পাই,বাংলার,বাংলায়
0,0.0,0.409123,0.0,0.0,0.0,0.52682,0.52682,0.0,0.0,0.0,0.52682
1,0.0,0.373119,0.0,0.0,0.0,0.480458,0.480458,0.0,0.0,0.631745,0.0
2,0.379945,0.224401,0.379945,0.379945,0.379945,0.0,0.0,0.379945,0.379945,0.0,0.288958
