In [None]:
import pandas as pd
import numpy as np
import pickle
from bnlp import NLTKTokenizer
from bnlp.corpus import stopwords, punctuations
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import warnings
warnings.filterwarnings('ignore')

In [None]:
# READING DATASET
# Load train.csv (path is relative to the notebook's working directory) into `df`.
train_csv = "train.csv"
df = pd.read_csv(train_csv)

In [None]:
# VISUALISING DATASET
# Show full cell contents. `None` is the supported way to disable column-width
# truncation; the previous value of -1 is deprecated since pandas 1.0.
pd.set_option("display.max_colwidth",None)
# Drop the "title" and "label" columns in place. errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df.drop(columns=["title","label"],inplace=True,errors="ignore")
df.head()

In [None]:
# REMOVING EMPTY ARTICLES
# Remove, in place, every row that has a missing value in any column of `df`,
# then show the resulting (rows, columns) shape.
df.dropna(axis=0, how="any", inplace=True)
df.shape

In [None]:
def todict(topics):
  """Turn a sequence of two-item entries into a dictionary.

  Each entry contributes its first item as the key and its second item as
  the value. When a key occurs more than once, the last occurrence wins.
  """
  return {entry[0]: entry[1] for entry in topics}

In [None]:
def topic_alloc(dic,num):
    """Ensure `dic` has an entry for every integer key in 0..num-1.

    Keys that are missing are added with the placeholder value -1; keys that
    are already present keep their value. `dic` is modified in place and the
    same object is returned.
    """
    for topic_id in range(num):
        dic.setdefault(topic_id, -1)
    return dic

In [None]:
def foreign_removal(word):
  """Return `word` with every character outside the accepted set removed.

  A character is kept when its code point lies in 2433..2554 (inclusive) and
  it is not one of the ten Bengali digits listed below. The kept characters
  are joined back together in their original order.
  """
  bengali_digits = {'১','২','৩','৪','৫','৬','৭','৮','৯','০'}
  kept = (
    ch for ch in word
    if 2433 <= ord(ch) <= 2554 and ch not in bengali_digits
  )
  return "".join(kept)

In [None]:
tokenizer = NLTKTokenizer()
def preprocess(text):
  """Tokenize `text` and return the list of tokens kept for modelling.

  Every token is first passed through `foreign_removal`; tokens that come
  back empty are discarded. The remaining tokens are then filtered against
  the imported `stopwords` and `punctuations` collections and the extra
  words listed in `rem`.
  """
  # Additional words filtered out on top of the imported stopword list.
  rem = ["এক","হয়","হয়ে","হয়েছে","দিয়ে","একটা","যায়"]
  cleaned_tokens = []
  for raw_token in tokenizer.word_tokenize(text):
    cleaned = foreign_removal(raw_token)
    if len(cleaned)>0:
      cleaned_tokens.append(cleaned)
  return [
    token for token in cleaned_tokens
    if not (token in stopwords or token in punctuations or token in rem)
  ]

In [None]:
# APPLYING PREPROCESSING ON DATASET
# One token list per row of the "article" column.
tokenized_text = df["article"].apply(preprocess)

# Train the phrase model on the tokenized articles (min_count=5, threshold=50).
bigram_phrases = Phrases(tokenized_text,min_count = 5,threshold = 50)

# Wrap the trained phrase model in a Phraser; this object is what gets applied
# to documents below.
bigram = Phraser(bigram_phrases)

def make_bigram(text):
    """Apply the module-level `bigram` model to every document in `text`.

    `text` is an iterable of documents (here, token lists); the result is a
    list holding `bigram[document]` for each of them, in the same order.
    """
    return [bigram[document] for document in text]

tokenize_text = make_bigram(tokenized_text)

# Preview only the first few documents; echoing the whole corpus floods the
# notebook output and bloats the saved file.
tokenize_text[:5]

In [None]:
# LDA MODEL TRAINING & FINDING BEST NUMBER OF TOPICS FOR THE DATASET
# One LDA model is trained for every topic count in MIN_TOPICS..MAX_TOPICS
# (inclusive) and the one with the highest coherence score is kept.
MIN_TOPICS = 5
MAX_TOPICS = 20

scores = []       # coherence score per candidate, in training order
model_list = []   # trained model per candidate, parallel to `scores`
dictionary = corpora.Dictionary(tokenize_text)
dtm = [dictionary.doc2bow(document) for document in tokenize_text]
for n_topics in range(MIN_TOPICS, MAX_TOPICS + 1):
  # Fixed random_state so repeated runs train the same models.
  LDA_model = LdaModel(corpus=dtm,num_topics=n_topics,id2word=dictionary,random_state=50)
  coherence_model = CoherenceModel(model=LDA_model,texts=tokenize_text,dictionary=dictionary)
  score = coherence_model.get_coherence()
  scores.append(score)
  model_list.append(LDA_model)
  print(f"COHERENCE SCORE FOR {n_topics} TOPICS = {score}")

# Position of the best score within `scores`. Named `best_idx` rather than
# `iter` so the builtin iter() is not shadowed for the rest of the session.
best_idx = int(np.argmax(scores))
# Position 0 corresponds to MIN_TOPICS topics, so the offset comes from the
# same constant that drives the loop.
num = MIN_TOPICS + best_idx

lda = model_list[best_idx]

In [None]:
# USING PYLDAVIS ON THE TRAINED LDA MODEL
# Last expression of the cell, so the prepared visualisation is what the
# notebook displays.
gensimvis.prepare(topic_model=lda, corpus=dtm, dictionary=dictionary)

In [None]:
# ASSIGNING TOPIC TO THE DATASET

# One entry per document in `dtm`; each entry is converted with todict() below,
# so it is presumably a list of (topic_id, probability) pairs - verify against
# the gensim documentation.
topics = list(lda.get_document_topics(dtm))

# topic id -> list of per-document values, filled in document order.
topic_doc = {topic_id: [] for topic_id in range(num)}

for doc_topics in topics:
  # Topic ids missing from a document's result are filled with -1.
  topic_dist = topic_alloc(todict(doc_topics),num)
  for topic_id, column in topic_doc.items():
    column.append(topic_dist[topic_id])

print(topic_doc.keys())

print(df.columns)
# Add one column per topic id to `df`, named with the id as a string.
for topic_id, column in topic_doc.items():
    df[str(topic_id)] = column

print(df.columns)
df.head()

In [26]:
# EXPORTING NECESSARY FILES

df.to_csv("bangla_news.csv")

# Output file name -> object to pickle.
# NOTE: "dtm.pkl" stores the bigram-processed token lists (`tokenize_text`),
# not the bag-of-words `dtm`; the file name is kept for existing consumers.
artifacts = {
    "recom.pkl": lda,
    "dtm.pkl": list(tokenize_text),
    "dict.pkl": dictionary,
    "bigram.pkl": bigram,
}

# The original opened "bigram.pkl" via `open(fbio, "wb")`, reading `fbio`
# before it was assigned (NameError), so the bigram model was never written.
# `with` also guarantees each file is closed even if pickling fails.
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)