# Topic Modelling

### Loading Data

In [1]:
import pandas as pd
import nltk

In [2]:
df = pd.DataFrame(columns=['news'])
df

Unnamed: 0,news


In [6]:
''' 
Get the file paths of all the news
'''
import os

# Specify the root directory from where you want to start traversing
root_directory = '../data/newscorpus'

# Initialize an empty list to store file paths
file_paths = []

# Recursively traverse the directory tree using os.walk()
for dirpath, dirnames, filenames in os.walk(root_directory):
    for filename in filenames:
        # Get the full file path by joining the directory path and the file name
        file_path = os.path.join(dirpath, filename)
        # Append the file path to the list
        file_paths.append(file_path)

# Display the list of file paths
print(file_paths)

['../data/newscorpus/Manoranjan/Manoranjan/324.txt', '../data/newscorpus/Manoranjan/Manoranjan/241.txt', '../data/newscorpus/Manoranjan/Manoranjan/752.txt', '../data/newscorpus/Manoranjan/Manoranjan/444.txt', '../data/newscorpus/Manoranjan/Manoranjan/257.txt', '../data/newscorpus/Manoranjan/Manoranjan/895.txt', '../data/newscorpus/Manoranjan/Manoranjan/75.txt', '../data/newscorpus/Manoranjan/Manoranjan/528.txt', '../data/newscorpus/Manoranjan/Manoranjan/981.txt', '../data/newscorpus/Manoranjan/Manoranjan/251.txt', '../data/newscorpus/Manoranjan/Manoranjan/995.txt', '../data/newscorpus/Manoranjan/Manoranjan/805.txt', '../data/newscorpus/Manoranjan/Manoranjan/440.txt', '../data/newscorpus/Manoranjan/Manoranjan/572.txt', '../data/newscorpus/Manoranjan/Manoranjan/29.txt', '../data/newscorpus/Manoranjan/Manoranjan/523.txt', '../data/newscorpus/Manoranjan/Manoranjan/755.txt', '../data/newscorpus/Manoranjan/Manoranjan/247.txt', '../data/newscorpus/Manoranjan/Manoranjan/218.txt', '../data/news

In [7]:
'''
Read all the news from the file path
Load it in dataframe
'''
for path in file_paths:
    with open(path,"r") as f:
        queries = f.read()
        df.loc[len(df)] = [queries]
df

Unnamed: 0,news
0,फिल्मी अनुराग वर्ष अघिसम्म पनि उनका लागि बलिउड...
1,पिरती गाउँदै जंगी जर्नेलनेपाली सेनाका प्रवक्ता...
2,"भान्सामा औषधि जाइफल""प्राचीनकालदेखि नै जाइफल वि..."
3,कट्रिना विश्वकै सेक्सीबलिउड नायिका सुन्दर छिन्...
4,तीन सम्मान पुरस्कारको घोषणागर्गाचार्य सामुदायि...
...,...
9995,आईटीसीले नेपालमा लगानी बढाउने मजदुर आन्दोलनका ...
9996,कोषले निगमलाई आज पैसा दिनेसरकार जमानत बसेको प...
9997,थानकोटमा ट्रक नियन्त्रणएउटै विभागको पथलैयास्थि...
9998,महिनादेखि ट्रेजरी र अन्तरबैंक दर प्रतिशतरकम ...


### Preprocessing

In [9]:
nepali_stopwords = open("../resources/stopwords.txt", "r")
stopwords = nepali_stopwords.read().split()
# print(stopwords)
print(len(stopwords))

501


In [10]:
'''
Remove unwanted characters
Remove stop words 
'''
import re
def string_manipulation(df)  : 
    df['news'] = df['news'].apply(lambda x: re.sub('[।(),०-९<<?!,—,–,/,’,‘,:,\u200d]', '', x))
    df['news'] = df['news'].apply(lambda x: " ".join([i.replace('\n', '').replace('\t', '').replace("\"",'') for i in x.split() if i not in (stopwords) and i != ' ']))
    return df

processed_data = string_manipulation(df)
# processed_data

In [11]:
from nepalitokenizer import NepaliTokenizer

In [12]:
'''
tokenizing the corpus 
'''
tokenize = NepaliTokenizer()
processed_data["news"] = processed_data["news"].apply(tokenize.tokenizer)
processed_data

Unnamed: 0,news
0,"[फिल्मी, अनुराग, वर्ष, अघिसम्म, उनका, बलिउड, स..."
1,"[पिरती, गाउँदै, जंगी, जर्नेलनेपाली, सेनाका, प्..."
2,"[भान्सामा, औषधि, जाइफलप्राचीनकालदेखि, जाइफल, र..."
3,"[कट्रिना, विश्वकै, सेक्सीबलिउड, नायिका, सुन्दर..."
4,"[सम्मान, पुरस्कारको, घोषणागर्गाचार्य, सामुदायि..."
...,...
9995,"[आईटीसीले, नेपालमा, लगानी, बढाउने, मजदुर, आन्द..."
9996,"[कोषले, निगमलाई, पैसा, दिनेसरकार, जमानत, बसेको..."
9997,"[थानकोटमा, ट्रक, नियन्त्रणएउटै, विभागको, पथलैय..."
9998,"[महिनादेखि, ट्रेजरी, अन्तरबैंक, दर, प्रतिशतरकम..."


In [20]:
import snowballstemmer

In [21]:
'''
Stemming & StopWord removal after Stemming
'''
stemmer = snowballstemmer.NepaliStemmer()

def get_stem(words):
    global stemmer
    new_list = stemmer.stemWords(words)
    return new_list

def clean_data(words):
    new_list = []
    for word in words:
        if len(word)>2 and word not in stopwords:
            new_list.append(word)

    return new_list
        
# ans = get_stem(["फिल्मी", "अनुराग", "वर्ष", "अघिसम्म", "उनका", "बलिउड"])
# ans

In [22]:
processed_data['news'] = processed_data['news'].apply(lambda x : get_stem(x))

In [23]:
processed_data['news'] = processed_data['news'].apply(lambda x : clean_data(x))

In [24]:
processed_data_list = processed_data['news']

### Visualization and Analysis of Data

In [25]:
'''
Finding the frequency Distribution of Words
'''
# words_list = []
# for sentence in processed_data_list:
#     words_list.extend(sentence)
# freq_dist = nltk.FreqDist(words_list)
# freq_dist.most_common(20)

'\nFinding the frequency Distribution of Words\n'

In [26]:
'''
Visualization of Most Frequency 
'''
# %config InlineBackend.figure_format = 'retina'
# import matplotlib.pyplot as plt
# from matplotlib.font_manager import FontProperties
# import seaborn as sns
# sns.set()
# nepali_font = FontProperties(fname = 'Mangal.ttf')
# temp = pd.DataFrame(freq_dist.most_common(30),  columns=['word', 'count'])
# fig, ax = plt.subplots(figsize=(10, 6))
# sns.barplot(x='word', y='count', 
#             data=temp, ax=ax)
# plt.title("Top words")
# plt.xticks(rotation='vertical',fontproperties=nepali_font);

'\nVisualization of Most Frequency \n'

In [27]:
'''
Forming Word Cloud
'''
# %config InlineBackend.figure_format = 'retina'

# from wordcloud import WordCloud
# import wordcloud
# # creation of wordcloud
# wcloud_fig = WordCloud(colormap='viridis', width=300, height=200, font_path="./Mangal.ttf").generate_from_frequencies(freq_dist)

# # plotting the wordcloud
# plt.figure(figsize=(10,7), frameon=True )

# plt.imshow(wcloud_fig, interpolation  = 'bilinear')
# plt.show()

'\nForming Word Cloud\n'

### Preparation for LDA

In [28]:
from gensim import corpora

In [29]:
from gensim.test.utils import datapath

In [30]:
'''
Representing the Corpus in dictionary: 
{unique_id : word}
'''
id2word = corpora.Dictionary(processed_data_list)
# print(id2word)

In [31]:
'''
Checking dictionary created
'''
# count = 0
# for k, v in id2word.iteritems():
#     print(k, v)
#     count += 1
#     if count > 10:
#         break

'\nChecking dictionary created\n'

In [32]:
'''
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
# id2word.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
id2word.filter_extremes(no_below=15, no_above=0.1, keep_n=None)

In [33]:
'''
bag-of-words format = list of (token_id, token_count) 2-tuples
'''
#  now lets create a encoded bag of words 
bow_corpus = [id2word.doc2bow(sent) for sent in processed_data_list]
# bow_corpus[0] 

In [34]:
'''
Preview BOW for our sample preprocessed document
'''
# Here document_num is document number 4310 which we have checked in Step 2
# document_num = 4310
# bow_doc_4310 = bow_corpus[document_num]
# count = 0

# for i in range(len(bow_doc_4310)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
#                                                      id2word[bow_doc_4310[i][0]], 
#                                                      bow_doc_4310[i][1]))
#     count += 1
#     if count > 10:
#         break

'\nPreview BOW for our sample preprocessed document\n'

### LDA - (BOW)

In [35]:
# Creating the object for LDA model using gensim library
import gensim
from gensim.models import CoherenceModel

In [36]:
'''
Loading the Saved Model
'''
#loading model from disk
from gensim import  models

temp_file = datapath("./lda_model")
lda_model = models.ldamodel.LdaModel.load(temp_file)

In [None]:
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# LDA = gensim.models.ldamulticore.LdaMulticore
# lda_model = LDA(corpus=corpus_matrix,id2word=id2word, num_topics=10, random_state=100,update_every=1,chunksize=100,passes=5,alpha='auto',per_word_topics=True)

# lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=30, id2word=id2word, passes=50, workers=5)

In [None]:
'''
For Saving  Model
'''
#saving model to disk.
temp_file = datapath("./lda_model_30")
lda_model.save(temp_file)

In [37]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic, idx ))
    print("\n")

Topic: 0.009*"मानिस" + 0.008*"उहाँ" + 0.005*"लेख" + 0.005*"तपाईं" + 0.005*"आँखा" + 0.004*"शब्द" + 0.004*"सपना" + 0.003*"सहर" + 0.003*"कहिल्यै" + 0.003*"गाउँ" 
Words: 0


Topic: 0.055*"अस्पताल" + 0.044*"उपचार" + 0.035*"बिरामी" + 0.034*"स्वास्थ्य" + 0.030*"औषधि" + 0.021*"चिकित्सक" + 0.016*"मृगौला" + 0.014*"केन्द्र" + 0.014*"मेडिकल" + 0.011*"प्रत्यारोपण" 
Words: 1


Topic: 0.028*"रोग" + 0.024*"संक्रमण" + 0.022*"क्यान्सर" + 0.018*"बच्चा" + 0.016*"बालबालिका" + 0.014*"स्वास्थ्य" + 0.013*"उपचार" + 0.013*"शिशु" + 0.009*"मृत्यु" + 0.009*"संक्रमित" 
Words: 2


Topic: 0.029*"पुरुष" + 0.025*"छोरी" + 0.021*"परिवार" + 0.020*"छोरा" + 0.020*"होटल" + 0.020*"गाउँ" + 0.019*"पर्यटक" + 0.019*"स्वर्ण" + 0.017*"विवाह" + 0.013*"थिइन्" 
Words: 3


Topic: 0.072*"फिल्म" + 0.019*"नाटक" + 0.012*"अभिनय" + 0.010*"निर्देशक" + 0.010*"चलचित्र" + 0.010*"दर्शक" + 0.008*"कथा" + 0.007*"कलाकार" + 0.006*"पात्र" + 0.006*"प्रेम" 
Words: 4


Topic: 0.044*"बैंक" + 0.017*"अर्ब" + 0.017*"डलर" + 0.016*"लगानी" + 0.015*"चीन" + 0.014*

In [38]:
'''
Testing random document, to get their topic distribution
Index     Type (document number 324)
'''
document_dict = {
    0    : "Manoranjan",
    1500 : "Sahitya",
    2500 : "Suchana Prabhidhi",
    3500 : "Bichar",
    4550 : "Swasthya",
    5500 : "Prabas",
    6500 : "Khelkud",
    7500 : "Viswa",
    8500 : "Desh",
    9500 : "Artha",
}

for k,v in document_dict.items():
    print("\n{}\n".format(v))
    for index, score in sorted(lda_model[bow_corpus[k]], key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)))


Manoranjan


Score: 0.6847162246704102	 
Topic: 0.072*"फिल्म" + 0.019*"नाटक" + 0.012*"अभिनय" + 0.010*"निर्देशक" + 0.010*"चलचित्र"

Score: 0.09194810688495636	 
Topic: 0.007*"राजनीति" + 0.007*"पहिचान" + 0.006*"जातीय" + 0.006*"प्रदेश" + 0.005*"संघीयता"

Score: 0.08855963498353958	 
Topic: 0.048*"गीत" + 0.023*"संगीत" + 0.016*"एल्बम" + 0.013*"गायक" + 0.007*"म्युजिक"

Score: 0.06027521193027496	 
Topic: 0.009*"मानिस" + 0.008*"उहाँ" + 0.005*"लेख" + 0.005*"तपाईं" + 0.005*"आँखा"

Score: 0.046008456498384476	 
Topic: 0.018*"कोरिया" + 0.018*"पत्रकार" + 0.012*"मेला" + 0.012*"सम्मेलन" + 0.011*"महासंघ"

Score: 0.02542412094771862	 
Topic: 0.014*"महोत्सव" + 0.014*"प्रतियोगिता" + 0.013*"गुरुङ" + 0.013*"मिस" + 0.012*"राई"

Sahitya


Score: 0.29581043124198914	 
Topic: 0.021*"कविता" + 0.017*"साहित्य" + 0.015*"पुस्तक" + 0.013*"चित्र" + 0.011*"पुरस्कार"

Score: 0.193568155169487	 
Topic: 0.018*"कोरिया" + 0.018*"पत्रकार" + 0.012*"मेला" + 0.012*"सम्मेलन" + 0.011*"महासंघ"

Score: 0.1516350656747818	 
Topic

In [39]:
'''
This groups the documents by their topic, for e.g
{
    0 : [list of documents in topic index 0]
}
'''

cluster_by_topic = {}
[cluster_by_topic.setdefault(i, []) for i in range(lda_model.num_topics)]
index = 0
for bow in bow_corpus:
    topics_list = lda_model.get_document_topics(bow,minimum_probability=0.8)
    for topic_id, score in topics_list: 
        cluster_by_topic[topic_id].append(file_paths[index])
    index += 1

In [40]:
'''
# Printing the documents that belong to a certain topic 
# '''
for k,v in cluster_by_topic.items():
    print ("Topic : {}".format(lda_model.print_topic(k)))
    print ("---------------------------------------------------")
    for paths in v:
        print (paths)
    print ("---------------------------------------------------")


Topic : 0.009*"मानिस" + 0.008*"उहाँ" + 0.005*"लेख" + 0.005*"तपाईं" + 0.005*"आँखा" + 0.004*"शब्द" + 0.004*"सपना" + 0.003*"सहर" + 0.003*"कहिल्यै" + 0.003*"गाउँ"
---------------------------------------------------
---------------------------------------------------
Topic : 0.055*"अस्पताल" + 0.044*"उपचार" + 0.035*"बिरामी" + 0.034*"स्वास्थ्य" + 0.030*"औषधि" + 0.021*"चिकित्सक" + 0.016*"मृगौला" + 0.014*"केन्द्र" + 0.014*"मेडिकल" + 0.011*"प्रत्यारोपण"
---------------------------------------------------
---------------------------------------------------
Topic : 0.028*"रोग" + 0.024*"संक्रमण" + 0.022*"क्यान्सर" + 0.018*"बच्चा" + 0.016*"बालबालिका" + 0.014*"स्वास्थ्य" + 0.013*"उपचार" + 0.013*"शिशु" + 0.009*"मृत्यु" + 0.009*"संक्रमित"
---------------------------------------------------
---------------------------------------------------
Topic : 0.029*"पुरुष" + 0.025*"छोरी" + 0.021*"परिवार" + 0.020*"छोरा" + 0.020*"होटल" + 0.020*"गाउँ" + 0.019*"पर्यटक" + 0.019*"स्वर्ण" + 0.017*"विवाह" + 0.013*"थिइन्"

In [41]:
'''
Printing the top 5 documents in each topics
'''
my_ids = [i for i in range(len(bow_corpus))]

top_documents = {}
[top_documents.setdefault(i, []) for i in range(lda_model.num_topics)]

for topic_id in range(lda_model.num_topics):
    tops = sorted(zip(my_ids, lda_model[bow_corpus]), reverse=True, key=lambda x: abs(dict(x[1]).get(topic_id, 0.0)))
    top_five =  tops[ : 5]
    for index, _ in top_five:
        top_documents[topic_id].append(file_paths[index])

In [42]:
'''
Printing the documents that belong to a certain topic 
'''
for k,v in top_documents.items():
    print ("Topic : {}".format(lda_model.print_topic(k)))
    print ("---------------------------------------------------")
    for paths in v:
        print (paths)
    print ("---------------------------------------------------")

Topic : 0.009*"मानिस" + 0.008*"उहाँ" + 0.005*"लेख" + 0.005*"तपाईं" + 0.005*"आँखा" + 0.004*"शब्द" + 0.004*"सपना" + 0.003*"सहर" + 0.003*"कहिल्यै" + 0.003*"गाउँ"
---------------------------------------------------
./newscorpus2/Swasthya/Swasthya/632.txt
./newscorpus2/Viswa/Viswa/642.txt
./newscorpus2/Swasthya/Swasthya/65.txt
./newscorpus2/Swasthya/Swasthya/459.txt
./newscorpus2/SuchanaPrabidhi/SuchanaPrabidhi/106.txt
---------------------------------------------------
Topic : 0.055*"अस्पताल" + 0.044*"उपचार" + 0.035*"बिरामी" + 0.034*"स्वास्थ्य" + 0.030*"औषधि" + 0.021*"चिकित्सक" + 0.016*"मृगौला" + 0.014*"केन्द्र" + 0.014*"मेडिकल" + 0.011*"प्रत्यारोपण"
---------------------------------------------------
./newscorpus2/Swasthya/Swasthya/408.txt
./newscorpus2/Swasthya/Swasthya/797.txt
./newscorpus2/Desh/Desh/574.txt
./newscorpus2/Desh/Desh/994.txt
./newscorpus2/Swasthya/Swasthya/852.txt
---------------------------------------------------
Topic : 0.028*"रोग" + 0.024*"संक्रमण" + 0.022*"क्यान्सर" 

In [43]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# fig, ax = plt.subplots(figsize=(10, 10))
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus = bow_corpus, dictionary = id2word)
# vis

### LDA - (TF-IDF)

In [None]:
'''
Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
'''
# from gensim import corpora, models

# tfidf = models.TfidfModel(bow_corpus)

In [None]:
'''
Apply transformation to the entire corpus and call it 'corpus_tfidf'
'''
# corpus_tfidf = tfidf[bow_corpus]

In [None]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''
# count = 0
# from pprint import pprint
# for doc in corpus_tfidf:
#     pprint(doc)
#     count +=1
#     if count > 10 :
#         break

In [None]:
# import operator

In [None]:
'''
This took more than 20 minutes so i had to stop
It attempts to keep training lda model until the thresold is reached
'''
def ret_top_model():
    """
    Since LDAmodel is a probabilistic model, it comes up different topics each time we run it. To control the
    quality of the topic model we produce, we can see what the interpretability of the best topic is and keep
    evaluating the topic model until this threshold is crossed. 
    
    Returns:
    -------
    lm: Final evaluated topic model
    top_topics: ranked topics in decreasing order. List of tuples
    """
    # top_topics = [(0, 0)]
    # while top_topics[0][1] < 0.97:

    #     lm = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=id2word, passes=2, workers=2)
    #     coherence_values = {}
    #     for n, topic in lm.show_topics(num_topics=-1, formatted=False):
    #         topic = [word for word, _ in topic]
    #         cm = CoherenceModel(topics=[topic], texts=processed_data_list, dictionary=id2word, window_size=10)
    #         coherence_values[n] = cm.get_coherence()
    #     top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
    #     print(top_topics[0][1])
    # return lm, top_topics

In [None]:
# lm, top_topics = ret_top_model()

### HDP

In [None]:
# from gensim.models import HdpModel

In [None]:
# hdp_model = gensim.models.HdpModel(bow_corpus, id2word=id2word)

In [None]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
# for idx, topic in hdp_model.print_topics(-1):
#     print("Topic: {} \nWords: {}".format(topic, idx ))
#     print("\n")

In [None]:
# document_dict = {
#     0 : "Manoranjan",
#     1000 : "Sahitya",
#     2000 : "Suchana Prabhidhi",
#     3000 : "Bichar",
#     4350 : "Swasthya",
#     5000 : "Prabas",
#     6000 : "Khelkud",
#     7000 : "Viswa",
#     8000 : "Desh",
#     9000 : "Artha",
# }
# document_num = 9200 
# print()
# # Our test document is document number 4310
# for k,v in document_dict.items():
#     print("\n{}\n".format(v))
#     for index, score in sorted(hdp_model[bow_corpus[k]], key=lambda tup: -1*tup[1]):
#         print("\nScore: {}\t \nTopic: {}".format(score, hdp_model.print_topic(index, 5)))

In [None]:
# '''
# Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
# '''

# lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
#                                        num_topics=25, 
#                                        id2word = id2word, 
#                                        passes = 2, 
#                                        workers=2)

In [None]:
# '''
# For each topic, we will explore the words occuring in that topic and its relative weight
# '''
# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print("Topic: {} Word: {}".format(idx, topic))
#     print("\n")

In [None]:
# '''
# Check which topic our test document belongs to using the LDA TF-IDF model.
# '''
# # Our test document is document number 4310
# for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
# lda_model.print_topics()

### Unseen Document Topic Identification 

In [44]:
df1 = pd.DataFrame(columns=['news'])
df1

Unnamed: 0,news


In [45]:
'''
Unseen Document
'''
# Artha
# df1.loc[0] = ['''
# 	सहकारीको घरजग्गा लगानी रोक्नुपर्छ अर्थमन्त्रीसहकारी मार्फत घरजग्गा क्षेत्रमा भएको लगानीमा जोखिम देखिन थालेकोले त्यसलाई रोक्नुपर्ने अर्थमन्त्री वर्षमान पुनले बताएका छन्  राष्ट्र बैंकको निरीक्षणमा गएका बेला पुनले घरजग्गा कारोबारी बैंकबाट सरेर सहकारी पुगेको भन्दै कारोबार रोक्नुपर्ने बताएका थिए  "
# 	सहकारी मार्फत घरजग्गा क्षेत्रमा भएको लगानीमा जोखिम देखिन थालेकोले त्यसलाई रोक्नुपर्ने अर्थमन्त्री वर्षमान पुनले बताएका छन्  राष्ट्र बैंकको निरीक्षणमा गएका बेला पुनले घरजग्गा कारोबारी बैंकबाट सरेर सहकारी पुगेको भन्दै कारोबार रोक्नुपर्ने बताएका थिए   
# 	राष्ट्र बैंकले नियमन गर्ने संस्थाहरू वाणिज्य बैंक विकास बैंक र फाइनान्स कम्पनीलाई घरजग्गा क्षेत्रमा लगानी गर्न पाउने सीमा तोकेको छ  यी संस्थाबाट ऋण लिन गाह्रो भएपछि घरजग्गा कारोबारी सहकारीतर्फ आकषिर्त भएका हुन्  फितलो नियमनका कारण पनि सहकारीबाट घरजग्गामा लगानी गर्न सजिलो रहेको विशेषज्ञ बताउँछन्   
# 	सहकारी विभागले एक वर्षअघि गरेको अध्ययनमा सहकारीको घरजग्गामा ठूलो लगानी देखिएको थियो   करोड रुपैयाँभन्दा बढी कारोबार गर्ने  सहकारीमा गरिएको उक्त अध्ययनले कुल लगानीको  प्रतिशतसम्म घरजग्गामा गएको देखिएको थियो  त्यसपछि विभागले समय तोकेर यस्तो ऋण घटाउन निर्देशन दिएको उपसचिव विष्णुप्रसाद घिमिरेले बताए  राष्ट्र बैंकको निर्देशन जस्तै तोकिएको समयमा घरजग्गा क्षेत्रको लगानी घटाउन निर्देशन दिएका थियौं उनले भने  तर यस्तो ऋण घटेनघटेकोबारे पछिल्लो जानकारी विभागसँग छैन   
# 	बलियो कानुन र कर्मचारी अभाव रहेको विभागले घरजग्गामा भएको लगानीबारे कुनै अनुसन्धान गर्न सकेको छैन   
# 	पहिलेको तुलनामा घरजग्गा क्षेत्रमा हुने लगानी केही घटेको तर यकिन तथ्यांक नरहेको घिमिरेले बताए   
# 	गत आर्थिक वर्षमा सहकारीको कुल कारोबार  खर्ब  अर्ब रुपैयाँ पुगेको छ  यो वित्तीय क्षेत्रको  प्रतिशत हुन्छ   
# 	विभागको तथ्यांक अनुसार गत आर्थिक वर्षमा  अर्ब  करोड सेयर पुँजी रहेका सहकारीले  खर्ब  अर्ब निक्षेप संकलन गरेर  खर्ब  अर्ब लगानी गरेका थिए   
# 	मुलुकभर  हजार  सय  सहकारी छन्  जसमा  हजार  सय  वटा बचत तथा ऋण सहकारी छन्  यस्ता सहकारीले सबैभन्दा बढी निक्षेप संकलन र लगानी गरेका छन्  यी संस्था बढी व्यापारमुखी रहेकाले घरजग्गा लगायतका क्षेत्रमा बढी लगानी गरेका छन्   
# 	यसबाहेक मुलुकमा  हजार  सय  बहुउद्देश्यीय सहकारी  हजार  सय  कृषि सहकारी  हजार  सय  दुग्ध उत्पादक सहकारी र  हजार  सय  उपभोक्ता सहकारी छन्   
# 	सहकारीको संख्या बर्सेनि बढ्दै गएपछि नियमन गर्ने विभागमा भने कर्मचारी पर्याप्त छैनन्  पछिल्लो समय विभागमा  सय  कर्मचारी कार्यरत छन्   
# 	''' ]

# Rajniti
# df1.loc[0] = ['''
# नेपाली कांग्रेसले कोशी प्रदेशमा विकसित भएको राजनीतिक घटनाक्रमलाई अप्रत्याशित र आकस्मिक भनेको छ।
# प्रचार विभाग प्रमुख मीनबहादुर विश्वकर्माले अप्रत्याशित र आकस्मिक घटना भएको बताए।
# कोशीमा सत्ता गठबन्धनले नेकपा माओवादी केन्द्रका इन्द्र आङ्बोलाई मुख्यमन्त्री बनाउने बिहीबार बालुवाटारमा छलफल गरेर सहमति गरेको थियो।
# तर नेता शेखर कोइरालानिकट कोशीका सांसदले मुख्यमन्त्री माओवादीलाई छाड्न चाहेनन्। कोइरालानिकट कोशी कांग्रेस सांसद केदार कार्कीले एमालेका ३९ र कांग्रेसका ८ गरी कूल ४७ जना सांसद (बहुमत) को हस्ताक्षर पेश गर्दै प्रदेश प्रमुखकहाँ मुख्यमन्त्रीमा दाबी पेश गरेका छन्।
# कोशीमा कांग्रेसका २९ जना सांसद छन्।
# कांग्रेस, माओवादी, एकीकृत समाजवादी र जसपा संसदीय दल नेताको हस्ताक्षरसहित माओवादीका आङ्बोले पनि मुख्यमन्त्रीमा दाबी गरेका छन्।
# कोशी प्रदेश प्रमुख परशुराम खापुङ्गले संविधानको धारा १६८ को उपधारा ५ अनुसार असोज २६ गते अपराह्न ५ बजेभित्र मुख्यमन्त्रीमा दाबी पेश गर्न समय दिएका थिए। 
# गत असोज २० गते हिक्मत कार्कीले मुख्यमन्त्री पदबाट राजीनामा दिएका थिए।
# कांग्रेससहित केन्द्रमा सरकार बनाएको सत्ता गठबन्धनका दलका शीर्ष नेताहरूले कोशीमा माओवादीको मुख्यमन्त्री बनाउन सहमति गरे पनि कांग्रेस कोशीका केही सांसदहरूले यो निर्णय अस्वीकार गरेका हुन्।
# कोशीमा विकसित राजनीतिक घटनाक्रम अप्रत्याशित भएको कांग्रेस प्रचार विभाग प्रमुख विश्वकर्माले बताए। ‘पार्टीका सबै नेताहरूसँग छलफल गरेर नै निर्णयमा पुगिएको थियो तर आकस्मिक र अप्रत्याशित घटना भएको छ। यसबारे पार्टीले बैठक नै गरेर धारणा बनाउनुपर्छ,’ उनले भने।
# ''' ]

# Khelkud
df1.loc[0] = ['''
फिफा विश्वकप २०२६ अन्तर्गत एसिया छनोटको पहिलो चरणको खेलमा नेपाल र लाओसले एक-एक गोलको बराबरी खेलेका छन्।
दशरथ रंगशालामा बिहीबार भएको खेलमा नेपालले एक गोलले पछि परेको स्थिति उल्टाएको थियो।
खेलमा लाओसका कप्तान बुङकोङले ३३ मिनेटमा गोल गर्दा घरेलु दर्शक स्तब्ध बनेको थियो।
तर दोस्रो हाफमा नेपालका लागि अञ्जन बिष्टले बराबरी गोल गर्दै नेपालको हार टारे।
उनले बराबरी गोलसँगै नेपालका लागि सर्वाधिक अन्तर्राष्ट्रिय गोलको कीर्तिमान बराबरी गरेका छन्। अञ्जनले खेलको ४९ मिनेटमा गोल गरेका थिए।
अञ्जनले ९ वर्ष राष्ट्रिय टिमबाट खेल्दा १३ गोल गरेका हुन्। उनले हरि खड्का र नीराजन रायमाझीको १३ गोलको कीर्तिमान बराबरी गर्न सफल भए। अब नेपाल र लाओसबीचको दोस्रो लेगको खेल १७ अक्टोबरमा लाओसमा हुनेछ।
''' ]

df1

Unnamed: 0,news
0,\nफिफा विश्वकप २०२६ अन्तर्गत एसिया छनोटको पहिल...


In [46]:
'''
Preparing the Bag of Words
'''
processed_new_data = string_manipulation(df1)

In [47]:
processed_new_data["news"] = processed_new_data["news"].apply(tokenize.tokenizer)

In [48]:
processed_new_data['news'] = processed_new_data['news'].apply(lambda x : get_stem(x))
# processed_new_data.news.to_list()

In [49]:
processed_new_data['news'] = processed_new_data['news'].apply(lambda x : clean_data(x))
for l in processed_new_data.news.to_list():
    print(len(l))

69


In [50]:
'''
Finds the topic and corresponding score for the unseen document 
'''
# Data preprocessing step for the unseen document
list_of_string = processed_new_data.news.to_list()[0] 
bow_vector = id2word.doc2bow(list_of_string)


for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3226774036884308	 Topic: 0.014*"महोत्सव" + 0.014*"प्रतियोगिता" + 0.013*"गुरुङ" + 0.013*"मिस" + 0.012*"राई"
Score: 0.20623783767223358	 Topic: 0.007*"राजनीति" + 0.007*"पहिचान" + 0.006*"जातीय" + 0.006*"प्रदेश" + 0.005*"संघीयता"
Score: 0.14930056035518646	 Topic: 0.048*"गीत" + 0.023*"संगीत" + 0.016*"एल्बम" + 0.013*"गायक" + 0.007*"म्युजिक"
Score: 0.1352580189704895	 Topic: 0.009*"मानिस" + 0.008*"उहाँ" + 0.005*"लेख" + 0.005*"तपाईं" + 0.005*"आँखा"
Score: 0.07550681382417679	 Topic: 0.055*"अस्पताल" + 0.044*"उपचार" + 0.035*"बिरामी" + 0.034*"स्वास्थ्य" + 0.030*"औषधि"
Score: 0.062440622597932816	 Topic: 0.023*"खेलाडी" + 0.022*"टिम" + 0.019*"गोल" + 0.017*"जित" + 0.012*"प्रतियोगिता"
Score: 0.03287834674119949	 Topic: 0.024*"लगानी" + 0.020*"सम्झौता" + 0.018*"मन्त्रालय" + 0.010*"सडक" + 0.010*"ऊर्जा"


### Document Similarity - JensenShanon Distance

In [51]:
import numpy as np

In [52]:
'''
Obtaining the topic distribution of every document
'''
doc_topic_dist = []
for bow in bow_corpus:
    topics_list = lda_model.get_document_topics(bow,minimum_probability=0)
    # print(topics_list)
    row = []
    for idx, score in topics_list:
        row.append(score)

    doc_topic_dist.append(row)

print(len(doc_topic_dist))

10000


In [53]:
doc_distribution = np.array(doc_topic_dist)
doc_distribution.shape

(10000, 25)

In [54]:
new_dist = []
for idx,score in lda_model.get_document_topics(bow_vector, minimum_probability=0):
    new_dist.append(score)

new_doc_distribution = np.array(new_dist)
new_doc_distribution.shape
# new_doc_distribution = np.array([tup[1] for tup in lda_model[bow_vector]])
# new_doc_distribution.shape

(25,)

In [55]:
from scipy.stats import entropy
from scipy.spatial import distance

In [57]:
# def jensen_shannon(query, matrix):
#     """
#     This function implements a Jensen-Shannon similarity
#     between the input query (an LDA topic distribution for a document)
#     and the entire corpus of topic distributions.
#     It returns an array of length M where M is the number of documents in the corpus
#     """
#     # lets keep with the p,q notation above
#     p = query[None,:].T # take transpose
#     q = matrix.T # transpose matrix
#     m = 0.5*(p + q)
#     return np.sqrt(0.5*(entropy(p,m) + entropy(q,m)))

def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    """
    sim = [distance.jensenshannon(data,query) for data in matrix]
    return np.array(sim)

In [58]:
def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims = jensen_shannon(query,matrix) # list of jensen shannon distances
    return sims.argsort()[:k] # the top k positional index of the smallest Jensen Shannon distances

In [59]:
most_sim_ids = get_most_similar_documents(new_doc_distribution, doc_distribution)
# print(most_sim_ids)
for ids in most_sim_ids:
    print(file_paths[ids])

./newscorpus2/Khelkud/Khelkud/218.txt
./newscorpus2/ArthaBanijya/ArthaBanijya/900.txt
./newscorpus2/Manoranjan/Manoranjan/670.txt
./newscorpus2/Viswa/Viswa/553.txt
./newscorpus2/Prabas/Prabas/120.txt
./newscorpus2/Khelkud/Khelkud/820.txt
./newscorpus2/Desh/Desh/177.txt
./newscorpus2/Khelkud/Khelkud/598.txt
./newscorpus2/Manoranjan/Manoranjan/964.txt
./newscorpus2/Desh/Desh/609.txt
