In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
from nltk.tokenize import word_tokenize

# Training and exporting a TF-IDF model

In [2]:
# Read the dataset CSV and expose its 'Text' column as a NumPy array,
# one entry per row, to serve as the corpus for the vectorizer.
df = pd.read_csv("./Data/Preprocessed/Cleaned_Nepali_dataset.csv")

corpus = df.loc[:, 'Text'].to_numpy()
print(corpus)

['गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन्टर जस्ता जोगाउन ल्याउदैछ विधेक ।'
 'दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।'
 'नेपाल ससकृती ध्वस्त पार्ने योजना !' ...
 'भालु आइमाई चाहिने बेला मज्जा थाप्ने बलात्कार ।'
 'कम्युनिस्ट जता हिंसा , बलात्कार हत्या ?' 'तपाईं कुवा दुबेर मरे ।']


In [3]:
# Tokenizer for Nepali text, built on NLTK according to its name; the
# implementation lives in the local `tokenizer` module and is not shown here.

from tokenizer import nepali_nltk_tokenizer

# Keeping the function in an importable module means the same tokenizer can be
# imported wherever the saved TF-IDF model is loaded, which resolves tokenizer
# issues at load time.

In [4]:
# Fit TF-IDF on the corpus using the custom NLTK-based tokenizer so that Nepali
# words are split by that tokenizer rather than by the default regex.
# token_pattern=None makes it explicit that the default pattern is unused once a
# tokenizer is supplied, and avoids scikit-learn's
# "The parameter 'token_pattern' will not be used" warning.
vectorizer = TfidfVectorizer(tokenizer=nepali_nltk_tokenizer, token_pattern=None)
vectorizer_model = vectorizer.fit_transform(corpus)  # sparse document-term TF-IDF matrix

# Print the learned inverse-document-frequency weight of every vocabulary term.
print('\nidf values:')
for term, idf_weight in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
    print(term, ':', idf_weight)




idf values:
! : 3.782427171240069
# : 7.349138991379798
% : 4.951243718581427
& : 7.166817434585844
' : 7.166817434585844
( : 6.473670254025898
) : 6.393627546352362
* : 8.265429723253952
, : 3.199675129936618
- : 6.879135362134062
-- : 8.265429723253952
. : 4.641088790277587
.. : 5.320990744087513
... : 4.739069198637791
/ : 6.014137924647458
00७ : 8.265429723253952
1 : 6.319519574198639
10 : 7.349138991379798
100 : 5.375057965357788
1000 : 8.265429723253952
1000000000000000 : 8.265429723253952
11 : 8.265429723253952
12 : 7.859964615145788
14 : 8.265429723253952
17हजर : 7.859964615145788
19- : 8.265429723253952
1०० : 8.265429723253952
2 : 6.393627546352362
20 : 8.265429723253952
2047 : 8.265429723253952
21 : 8.265429723253952
22 : 8.265429723253952
24 : 7.166817434585844
25 : 7.859964615145788
3 : 5.867534450455582
30 : 8.265429723253952
334 : 8.265429723253952
35 : 8.265429723253952
392 : 8.265429723253952
4 : 8.265429723253952
40 : 8.265429723253952
45 : 8.265429723253952
5 : 7.349

In [5]:
# Show the fitted TF-IDF matrix under a heading; each printed entry is a
# (row, column) pair followed by the stored value.
print('\ntf-idf value:', vectorizer_model, sep='\n')


tf-idf value:
  (0, 2126)	0.22482936098107206
  (0, 7215)	0.5261111339323292
  (0, 7145)	0.2408075609408939
  (0, 3066)	0.2779380241895994
  (0, 6673)	0.23467589826688548
  (0, 2125)	0.24430588828282968
  (0, 2631)	0.22273138107621307
  (0, 2550)	0.3033797528561161
  (0, 8001)	0.2884972956326814
  (0, 2739)	0.1943517123769339
  (0, 2900)	0.257397519325033
  (0, 7136)	0.3033797528561161
  (0, 8516)	0.04145219621970836
  (1, 8516)	0.04259875855984148
  (1, 3509)	0.6235423944904308
  (1, 3718)	0.1277355380686647
  (1, 7420)	0.3117711972452154
  (1, 5465)	0.2856257536825735
  (1, 1708)	0.23104811177755047
  (1, 7971)	0.3117711972452154
  (1, 1972)	0.2964770931963486
  (1, 6862)	0.2964770931963486
  (1, 4240)	0.2964770931963486
  (2, 4191)	0.22480452661671965
  (2, 7710)	0.5238630505564255
  :	:
  (2855, 5380)	0.2866052142413007
  (2855, 6553)	0.27611519475068563
  (2855, 7921)	0.30139006631974186
  (2855, 8305)	0.30139006631974186
  (2855, 6295)	0.30139006631974186
  (2856, 8516)	0.063119

Here, each pair such as (0, 7145) represents (document index, term index), and the corresponding value is that term's TF-IDF value for that document.

## saving using joblib dump

In [6]:
# Save the fitted vectorizer to disk using joblib; the call's return value
# (the list of written file names) is displayed as the cell output.
# NOTE(review): the custom tokenizer is presumably pickled by reference, so the
# `tokenizer` module would need to be importable wherever this file is loaded — confirm.
dump(vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

## Loading using joblib load

In [7]:
# Reload the vectorizer from the joblib file to check that it round-trips.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load artifacts that come from a trusted source.

loaded_vectorizer = load('tfidf_vectorizer.joblib')

In [8]:
# Show the type of the object that came back from disk.
print(type(loaded_vectorizer))


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [9]:
# Show the vocabulary terms held by the reloaded vectorizer
# (NumPy abbreviates long arrays with '...' when printed).
print(loaded_vectorizer.get_feature_names_out())


['!' '#' '%' ... '🤣' '🤣🤣🤣' '🤮']


In [10]:
# Transform a single sample sentence with the reloaded vectorizer.
sample_text = ["बलात्कारी बाउ छोरो कता गयो मुजि , रबि दाजु हिरासत हुदा खुब 10 लाख माग्यो भनेर आरोप लगाईरा रण्डि छोरो ।"]
tfidf_matrix = loaded_vectorizer.transform(sample_text)
# Each printed entry is (row, vocabulary column) followed by the TF-IDF value.
print(tfidf_matrix)


  (0, 8)	0.10565794845128053
  (0, 17)	0.24267930873591032
  (0, 895)	0.22715910208551066
  (0, 1222)	0.2137700517127205
  (0, 1760)	0.23665878450547873
  (0, 1907)	0.22715910208551066
  (0, 2613)	0.41735954201123204
  (0, 3527)	0.1908813189199623
  (0, 5104)	0.22715910208551066
  (0, 5162)	0.20867977100561602
  (0, 5689)	0.16599067252452962
  (0, 6180)	0.27293656767102703
  (0, 6332)	0.1634973028021022
  (0, 6578)	0.25004783487826887
  (0, 6593)	0.1152619641690778
  (0, 6878)	0.27293656767102703
  (0, 6946)	0.21979057594315213
  (0, 8335)	0.20867977100561602
  (0, 8361)	0.2166432945299772
  (0, 8516)	0.037292601276522654


In [11]:
# Map the non-zero columns of the sample's TF-IDF row back to their terms.
# The column indices are read from tfidf_matrix itself instead of being copied
# by hand from the printed output, so this cell stays correct if the sample
# sentence or the fitted vocabulary changes.

feature_names = loaded_vectorizer.get_feature_names_out()
# nonzero() returns (row_indices, column_indices); only the columns matter here.
# Sorting keeps the terms in vocabulary order.
indices = sorted(tfidf_matrix.nonzero()[1].tolist())
terms = [feature_names[i] for i in indices]
print(terms)


[',', '10', 'आरोप', 'कता', 'खुब', 'गयो', 'छोरो', 'दाजु', 'बलात्कारी', 'बाउ', 'भनेर', 'माग्यो', 'मुजि', 'रण्डि', 'रबि', 'लगाईरा', 'लाख', 'हिरासत', 'हुदा', '।']


In [12]:
# Report the reloaded vectorizer's stop-word list, analyzer setting and the
# tokenizer callable it will actually use.
for label, value in (
    ("Stopwords used:", loaded_vectorizer.get_stop_words()),
    ("Analyzer:", loaded_vectorizer.analyzer),
    ("Tokenizer:", loaded_vectorizer.build_tokenizer()),
):
    print(label, value)



Stopwords used: None
Analyzer: word
Tokenizer: <function nepali_nltk_tokenizer at 0x7fbcb98d5260>


# The tfidf model is ready to be used

In [13]:
# Densify the sample's TF-IDF row into a flat 1-D array with one slot per
# vocabulary term, then peek at the first ten slots.
dense_sample = loaded_vectorizer.transform(sample_text).toarray()
tfidf_scores = dense_sample.flatten()
print(tfidf_scores[:10])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.10565795 0.        ]


In [14]:
# Score at vocabulary index 0; the feature-name listing puts '!' there, a term
# that does not occur in the sample sentence.
tfidf_scores[0]

0.0

In [15]:
# Score at vocabulary index 17, which the earlier index-to-term lookup maps to '10'.
tfidf_scores[17]

0.24267930873591032

In [16]:
# Testing if we can use tf-idf for sentence embedding
# Get the TF-IDF weights for each word
sentence = "बलात्कारी बाउ छोरो कता गयो मुजि , रबि दाजु हिरासत हुदा खुब 10 लाख माग्यो भनेर आरोप लगाईरा रण्डि छोरो ।"
words = word_tokenize(sentence)
tfidf_scores = loaded_vectorizer.transform([sentence]).toarray().flatten()

# NOTE: this first attempt pairs the i-th token with the i-th vocabulary column,
# so the printed numbers are NOT the tokens' own weights — tfidf_scores is
# indexed by vocabulary position, not by position in the sentence.
for token, weight in zip(words, tfidf_scores):
    print(token, weight)

बलात्कारी 0.0
बाउ 0.0
छोरो 0.0
कता 0.0
गयो 0.0
मुजि 0.0
, 0.0
रबि 0.0
दाजु 0.10565794845128053
हिरासत 0.0
हुदा 0.0
खुब 0.0
10 0.0
लाख 0.0
माग्यो 0.0
भनेर 0.0
आरोप 0.0
लगाईरा 0.24267930873591032
रण्डि 0.0
छोरो 0.0
। 0.0


In [17]:
# get_feature_names_out() returns the whole fitted vocabulary in column order.
# Its optional input_features argument is not used by TfidfVectorizer, so no
# sentence is passed; passing one wrongly suggests the result is sentence-specific.
feature_names = loaded_vectorizer.get_feature_names_out()
feature_names[:10]

array(['!', '#', '%', '&', "'", '(', ')', '*', ',', '-'], dtype=object)

## We can see that ',' is at index 8. We can use this to find the index of any word

In [18]:
# Print each token's own TF-IDF weight by looking up its column in the fitted
# vocabulary (a term -> column-index dict). This replaces a full NumPy
# membership scan plus a tolist().index() search per token with one dict lookup.
# Tokens that are not in the vocabulary are skipped.
term_to_index = loaded_vectorizer.vocabulary_
for word in words:
    index = term_to_index.get(word)
    if index is not None:
        print(word, tfidf_scores[index])

बलात्कारी 0.22715910208551066
बाउ 0.20867977100561602
छोरो 0.41735954201123204
कता 0.2137700517127205
गयो 0.22715910208551066
मुजि 0.1634973028021022
, 0.10565794845128053
रबि 0.1152619641690778
दाजु 0.1908813189199623
हिरासत 0.20867977100561602
हुदा 0.2166432945299772
खुब 0.23665878450547873
10 0.24267930873591032
लाख 0.21979057594315213
माग्यो 0.27293656767102703
भनेर 0.16599067252452962
आरोप 0.22715910208551066
लगाईरा 0.27293656767102703
रण्डि 0.25004783487826887
छोरो 0.41735954201123204
। 0.037292601276522654


## OK, we now have working logic that can be used to weight the word vectors of embedders