In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
from nltk.tokenize import word_tokenize

# Training and exporting Tfidf model

In [2]:
# Read the cleaned Nepali dataset and pull the raw text column out as a NumPy array.
DATASET_PATH = "./Data/Preprocessed/Cleaned_Nepali_dataset.csv"
df = pd.read_csv(DATASET_PATH)

corpus = df.loc[:, 'Text'].to_numpy()
print(corpus)

['‡§ó‡•Å‡§†‡•Ä ‡§µ‡§ø‡§ß‡•á‡§ï ‡§≤‡•ç‡§Ø‡§æ‡§è‡§∞ ‡§†‡§Æ‡•á‡§≤ ‡§∞‡§æ‡§ú ‡§ó‡•Å‡§†‡§ø ‡§ú‡§ó‡•ç‡§ó‡§æ ‡§õ‡§æ‡§Ø‡§æ ‡§∏‡•á‡§®‡•ç‡§ü‡§∞ ‡§ú‡§∏‡•ç‡§§‡§æ ‡§ú‡•ã‡§ó‡§æ‡§â‡§® ‡§≤‡•ç‡§Ø‡§æ‡§â‡§¶‡•à‡§õ ‡§µ‡§ø‡§ß‡•á‡§ï ‡•§'
 '‡§¶‡§≤‡•á ‡§¶‡•á‡§∂ ‡§∏‡§ï‡•á‡§õ‡§® ‡§¨‡•á‡§ö‡•á ‡§ñ‡§æ‡§® ‡§∏‡•Å‡§∞‡•Ç ‡§ó‡§∞‡•á‡§õ‡§® ‡§¶‡§≤‡•á ‡§≤‡§ñ‡•á‡§ü‡§®‡•Å ‡§™‡§õ ‡•§'
 '‡§®‡•á‡§™‡§æ‡§≤ ‡§∏‡§∏‡§ï‡•É‡§§‡•Ä ‡§ß‡•ç‡§µ‡§∏‡•ç‡§§ ‡§™‡§æ‡§∞‡•ç‡§®‡•á ‡§Ø‡•ã‡§ú‡§®‡§æ !' ...
 '‡§≠‡§æ‡§≤‡•Å ‡§Ü‡§á‡§Æ‡§æ‡§à ‡§ö‡§æ‡§π‡§ø‡§®‡•á ‡§¨‡•á‡§≤‡§æ ‡§Æ‡§ú‡•ç‡§ú‡§æ ‡§•‡§æ‡§™‡•ç‡§®‡•á ‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞ ‡•§'
 '‡§ï‡§Æ‡•ç‡§Ø‡•Å‡§®‡§ø‡§∏‡•ç‡§ü ‡§ú‡§§‡§æ ‡§π‡§ø‡§Ç‡§∏‡§æ , ‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞ ‡§π‡§§‡•ç‡§Ø‡§æ ?' '‡§§‡§™‡§æ‡§à‡§Ç ‡§ï‡•Å‡§µ‡§æ ‡§¶‡•Å‡§¨‡•á‡§∞ ‡§Æ‡§∞‡•á ‡•§']


In [3]:
# Use the NLTK-based tokenizer so that Nepali words are tokenized correctly.
# `tokenizer` is a project-local module (not shown in this notebook).

from tokenizer import nepali_nltk_tokenizer

# Keeping the tokenizer in its own module means the same function can be imported wherever the TF-IDF model is loaded, which avoids tokenizer issues at load time

In [4]:
# Fit the TF-IDF model with the NLTK-based tokenizer so that each Nepali word is
# tokenized correctly.
# token_pattern=None: the default regex pattern is unused once a custom tokenizer
# is supplied; passing None makes that explicit and avoids scikit-learn's
# "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(tokenizer=nepali_nltk_tokenizer, token_pattern=None)
# NOTE: despite its name, `vectorizer_model` holds the document-term TF-IDF matrix
# returned by fit_transform; the fitted model itself is `vectorizer`.
vectorizer_model = vectorizer.fit_transform(corpus)

# get idf values (one line per vocabulary term)
print('\nidf values:')
for term, idf_value in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
    print(term, ':', idf_value)




idf values:
! : 3.782427171240069
# : 7.349138991379798
% : 4.951243718581427
& : 7.166817434585844
' : 7.166817434585844
( : 6.473670254025898
) : 6.393627546352362
* : 8.265429723253952
, : 3.199675129936618
- : 6.879135362134062
-- : 8.265429723253952
. : 4.641088790277587
.. : 5.320990744087513
... : 4.739069198637791
/ : 6.014137924647458
00‡•≠ : 8.265429723253952
1 : 6.319519574198639
10 : 7.349138991379798
100 : 5.375057965357788
1000 : 8.265429723253952
1000000000000000 : 8.265429723253952
11 : 8.265429723253952
12 : 7.859964615145788
14 : 8.265429723253952
17‡§π‡§ú‡§∞ : 7.859964615145788
19- : 8.265429723253952
1‡•¶‡•¶ : 8.265429723253952
2 : 6.393627546352362
20 : 8.265429723253952
2047 : 8.265429723253952
21 : 8.265429723253952
22 : 8.265429723253952
24 : 7.166817434585844
25 : 7.859964615145788
3 : 5.867534450455582
30 : 8.265429723253952
334 : 8.265429723253952
35 : 8.265429723253952
392 : 8.265429723253952
4 : 8.265429723253952
40 : 8.265429723253952
45 : 8.2654297232539

In [5]:
# Show the fitted document-term matrix under a heading.
# A single print with a newline separator emits the same text as two prints.
print('\ntf-idf value:', vectorizer_model, sep='\n')


tf-idf value:
  (0, 2126)	0.22482936098107206
  (0, 7215)	0.5261111339323292
  (0, 7145)	0.2408075609408939
  (0, 3066)	0.2779380241895994
  (0, 6673)	0.23467589826688548
  (0, 2125)	0.24430588828282968
  (0, 2631)	0.22273138107621307
  (0, 2550)	0.3033797528561161
  (0, 8001)	0.2884972956326814
  (0, 2739)	0.1943517123769339
  (0, 2900)	0.257397519325033
  (0, 7136)	0.3033797528561161
  (0, 8516)	0.04145219621970836
  (1, 8516)	0.04259875855984148
  (1, 3509)	0.6235423944904308
  (1, 3718)	0.1277355380686647
  (1, 7420)	0.3117711972452154
  (1, 5465)	0.2856257536825735
  (1, 1708)	0.23104811177755047
  (1, 7971)	0.3117711972452154
  (1, 1972)	0.2964770931963486
  (1, 6862)	0.2964770931963486
  (1, 4240)	0.2964770931963486
  (2, 4191)	0.22480452661671965
  (2, 7710)	0.5238630505564255
  :	:
  (2855, 5380)	0.2866052142413007
  (2855, 6553)	0.27611519475068563
  (2855, 7921)	0.30139006631974186
  (2855, 8305)	0.30139006631974186
  (2855, 6295)	0.30139006631974186
  (2856, 8516)	0.063119

Here each pair such as (0, 7215) represents (document index, term index), and the corresponding value is that term's tf-idf value for that document.

## Saving using joblib dump

In [6]:
# Persist the fitted vectorizer to disk with joblib.
# The call is left as the cell's last expression so the returned file list is displayed.
dump(value=vectorizer, filename='tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

## Loading using joblib load

In [7]:
# Round-trip check: read the saved vectorizer back from disk.
# joblib files are pickle-based, so only load files from a trusted source.

loaded_vectorizer = load(filename='tfidf_vectorizer.joblib')

In [8]:
# Confirm which class the reloaded object is an instance of.
print(loaded_vectorizer.__class__)


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [9]:
# The reloaded vectorizer should expose the fitted vocabulary.
vocabulary_terms = loaded_vectorizer.get_feature_names_out()
print(vocabulary_terms)


['!' '#' '%' ... 'ü§£' 'ü§£ü§£ü§£' 'ü§Æ']


In [10]:
sample_text = ["‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡•Ä ‡§¨‡§æ‡§â ‡§õ‡•ã‡§∞‡•ã ‡§ï‡§§‡§æ ‡§ó‡§Ø‡•ã ‡§Æ‡•Å‡§ú‡§ø , ‡§∞‡§¨‡§ø ‡§¶‡§æ‡§ú‡•Å ‡§π‡§ø‡§∞‡§æ‡§∏‡§§ ‡§π‡•Å‡§¶‡§æ ‡§ñ‡•Å‡§¨ 10 ‡§≤‡§æ‡§ñ ‡§Æ‡§æ‡§ó‡•ç‡§Ø‡•ã ‡§≠‡§®‡•á‡§∞ ‡§Ü‡§∞‡•ã‡§™ ‡§≤‡§ó‡§æ‡§à‡§∞‡§æ ‡§∞‡§£‡•ç‡§°‡§ø ‡§õ‡•ã‡§∞‡•ã ‡•§"]
# Vectorize the sample with the reloaded model; each printed entry is
# (document index, term index) followed by the tf-idf weight.
tfidf_matrix = loaded_vectorizer.transform(raw_documents=sample_text)
print(tfidf_matrix)


  (0, 8)	0.10565794845128053
  (0, 17)	0.24267930873591032
  (0, 895)	0.22715910208551066
  (0, 1222)	0.2137700517127205
  (0, 1760)	0.23665878450547873
  (0, 1907)	0.22715910208551066
  (0, 2613)	0.41735954201123204
  (0, 3527)	0.1908813189199623
  (0, 5104)	0.22715910208551066
  (0, 5162)	0.20867977100561602
  (0, 5689)	0.16599067252452962
  (0, 6180)	0.27293656767102703
  (0, 6332)	0.1634973028021022
  (0, 6578)	0.25004783487826887
  (0, 6593)	0.1152619641690778
  (0, 6878)	0.27293656767102703
  (0, 6946)	0.21979057594315213
  (0, 8335)	0.20867977100561602
  (0, 8361)	0.2166432945299772
  (0, 8516)	0.037292601276522654


In [11]:
# let's check the terms that we got as output

feature_names = loaded_vectorizer.get_feature_names_out()
# Read the non-zero column indices straight from the sparse matrix instead of
# copying them by hand from the printed output, so this cell stays correct if
# the sample text or the fitted vocabulary changes.
# nonzero() returns (row indices, column indices); only the columns are needed
# because the sample contains a single document.
indices = sorted(tfidf_matrix.nonzero()[1].tolist())
terms = [feature_names[i] for i in indices]
print(terms)


[',', '10', '‡§Ü‡§∞‡•ã‡§™', '‡§ï‡§§‡§æ', '‡§ñ‡•Å‡§¨', '‡§ó‡§Ø‡•ã', '‡§õ‡•ã‡§∞‡•ã', '‡§¶‡§æ‡§ú‡•Å', '‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡•Ä', '‡§¨‡§æ‡§â', '‡§≠‡§®‡•á‡§∞', '‡§Æ‡§æ‡§ó‡•ç‡§Ø‡•ã', '‡§Æ‡•Å‡§ú‡§ø', '‡§∞‡§£‡•ç‡§°‡§ø', '‡§∞‡§¨‡§ø', '‡§≤‡§ó‡§æ‡§à‡§∞‡§æ', '‡§≤‡§æ‡§ñ', '‡§π‡§ø‡§∞‡§æ‡§∏‡§§', '‡§π‡•Å‡§¶‡§æ', '‡•§']


In [12]:
# Inspect the configuration that survived the save/load round trip.
vectorizer_settings = [
    ("Stopwords used:", loaded_vectorizer.get_stop_words()),
    ("Analyzer:", loaded_vectorizer.analyzer),
    ("Tokenizer:", loaded_vectorizer.build_tokenizer()),
]
for label, value in vectorizer_settings:
    print(label, value)



Stopwords used: None
Analyzer: word
Tokenizer: <function nepali_nltk_tokenizer at 0x7fbcb98d5260>


# The tfidf model is ready to be used

In [13]:
# Densify the sample's tf-idf row into a flat 1-D array with one entry per
# vocabulary term, then preview the first ten entries.
sample_dense = loaded_vectorizer.transform(sample_text).toarray()
tfidf_scores = sample_dense.flatten()
print(tfidf_scores[:10])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.10565795 0.        ]


In [14]:
# Column 0 is the '!' term, which does not occur in the sample sentence.
tfidf_scores[0]

0.0

In [15]:
# Column 17 is the '10' term, which does occur in the sample sentence.
tfidf_scores[17]

0.24267930873591032

In [16]:
# Testing if we can use tf-idf for sentence embedding
# Tokenize the sample sentence and compute its dense tf-idf vector.
# `sample_text` is reused rather than repeating the sentence literal, so the
# tokens and the scores are guaranteed to come from the same text.
words = word_tokenize(sample_text[0])
tfidf_scores = loaded_vectorizer.transform(sample_text).toarray().flatten()

# NOTE: zip() pairs the i-th token with the i-th vocabulary column, not with the
# token's own column, so the values printed here are not the tokens' weights.
for word, score in zip(words, tfidf_scores):
    print(word, score )

‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡•Ä 0.0
‡§¨‡§æ‡§â 0.0
‡§õ‡•ã‡§∞‡•ã 0.0
‡§ï‡§§‡§æ 0.0
‡§ó‡§Ø‡•ã 0.0
‡§Æ‡•Å‡§ú‡§ø 0.0
, 0.0
‡§∞‡§¨‡§ø 0.0
‡§¶‡§æ‡§ú‡•Å 0.10565794845128053
‡§π‡§ø‡§∞‡§æ‡§∏‡§§ 0.0
‡§π‡•Å‡§¶‡§æ 0.0
‡§ñ‡•Å‡§¨ 0.0
10 0.0
‡§≤‡§æ‡§ñ 0.0
‡§Æ‡§æ‡§ó‡•ç‡§Ø‡•ã 0.0
‡§≠‡§®‡•á‡§∞ 0.0
‡§Ü‡§∞‡•ã‡§™ 0.0
‡§≤‡§ó‡§æ‡§à‡§∞‡§æ 0.24267930873591032
‡§∞‡§£‡•ç‡§°‡§ø 0.0
‡§õ‡•ã‡§∞‡•ã 0.0
‡•§ 0.0


In [17]:
# get_feature_names_out() always returns the full fitted vocabulary; its optional
# `input_features` argument is not used by the vectorizer, so nothing is passed.
feature_names = loaded_vectorizer.get_feature_names_out()
feature_names[:10]

array(['!', '#', '%', '&', "'", '(', ')', '*', ',', '-'], dtype=object)

## We can see that ',' is at index 8; the same lookup can be used to find the index of any word

In [18]:
# Look up each token's column through the fitted vocabulary (term -> column index).
# A dict lookup avoids rebuilding and scanning the full feature-name list on every
# iteration; tokens missing from the vocabulary are skipped.
term_to_index = loaded_vectorizer.vocabulary_
for word in words:
    index = term_to_index.get(word)
    if index is not None:
        print(word, tfidf_scores[index])

‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡•Ä 0.22715910208551066
‡§¨‡§æ‡§â 0.20867977100561602
‡§õ‡•ã‡§∞‡•ã 0.41735954201123204
‡§ï‡§§‡§æ 0.2137700517127205
‡§ó‡§Ø‡•ã 0.22715910208551066
‡§Æ‡•Å‡§ú‡§ø 0.1634973028021022
, 0.10565794845128053
‡§∞‡§¨‡§ø 0.1152619641690778
‡§¶‡§æ‡§ú‡•Å 0.1908813189199623
‡§π‡§ø‡§∞‡§æ‡§∏‡§§ 0.20867977100561602
‡§π‡•Å‡§¶‡§æ 0.2166432945299772
‡§ñ‡•Å‡§¨ 0.23665878450547873
10 0.24267930873591032
‡§≤‡§æ‡§ñ 0.21979057594315213
‡§Æ‡§æ‡§ó‡•ç‡§Ø‡•ã 0.27293656767102703
‡§≠‡§®‡•á‡§∞ 0.16599067252452962
‡§Ü‡§∞‡•ã‡§™ 0.22715910208551066
‡§≤‡§ó‡§æ‡§à‡§∞‡§æ 0.27293656767102703
‡§∞‡§£‡•ç‡§°‡§ø 0.25004783487826887
‡§õ‡•ã‡§∞‡•ã 0.41735954201123204
‡•§ 0.037292601276522654


## OK, we now have working logic that can be used to weight the word vectors produced by embedders