In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk

In [3]:
import json
import os

In [22]:
def readData(path):
    """Load every JSON record stored in the ``.txt`` files below ``path``.

    The directory tree is walked recursively. Each matching file is parsed as
    JSON (UTF-8) and the items obtained by iterating over its top-level value
    are appended to one flat list.

    Args:
        path: Root directory to search.

    Returns:
        list: The items of all files, ordered by sorted file path.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    files = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            # Test the real extension: a substring check would also accept
            # names such as 'notes.txt.bak'.
            if name.endswith('.txt'):
                files.append(os.path.join(root, name))
    # os.walk gives no ordering guarantee; sorting keeps the record order (and
    # everything derived from it downstream) reproducible between runs.
    files.sort()

    all_data_json_list = []
    for file_path in files:
        with open(file_path, encoding='utf-8') as json_file:
            all_data_json_list.extend(json.load(json_file))
    print('Total data read -> ' + str(len(all_data_json_list)))
    return all_data_json_list

In [85]:
def convertToDataFrame(data_json):
    """Build a pandas DataFrame from ``data_json`` and print its shape.

    Args:
        data_json: Input handed unchanged to the ``pd.DataFrame`` constructor
            (the notebook passes lists of dicts).

    Returns:
        pandas.DataFrame: The newly built frame.
    """
    frame = pd.DataFrame(data_json)
    shape_text = str(frame.shape)
    print('DataFrame shape' + shape_text)
    return frame

In [13]:
def convertToList(data_json_list, key):
    """Collect the value stored under ``key`` from every record.

    Args:
        data_json_list: Iterable of records that support ``record[key]``.
        key: The key to look up in each record.

    Returns:
        list: One value per record, in input order. Lookup errors (for
        example ``KeyError`` for a dict without ``key``) propagate.
    """
    return [record[key] for record in data_json_list]

In [14]:
def valid_bengali_letters(char):
    """Tell whether ``char`` lies in the accepted code-point range.

    The accepted range is 2433..2543 inclusive (U+0981..U+09EF), which is
    part of the Unicode Bengali block.

    Args:
        char: A single-character string.

    Returns:
        bool: True when the character's code point is inside the range.
    """
    code_point = ord(char)
    return 2433 <= code_point <= 2543

def get_replacement(char):
    """Map one character to what should appear in the cleaned text.

    Args:
        char: A single-character string.

    Returns:
        str: ``char`` itself when ``valid_bengali_letters`` accepts it,
        otherwise a single space.
    """
    if valid_bengali_letters(char):
        return char
    # No separate case is needed for the separator code points 10, 2404,
    # 2405, 2551 and 9576: they all lie outside the accepted range and are
    # therefore turned into a space like every other rejected character.
    return ' '

def get_valid_lines(line):
    """Return ``line`` with every character passed through ``get_replacement``.

    Args:
        line: The text to clean.

    Returns:
        str: A string of the same length in which each character has been
        replaced by the result of ``get_replacement``.
    """
    return ''.join(get_replacement(symbol) for symbol in line)

def sent_to_words(sentences):
    """Lazily tokenize each text in ``sentences``.

    Every text is first cleaned with ``get_valid_lines`` and the cleaned
    string is then split into tokens by ``nltk.word_tokenize``.

    Args:
        sentences: Iterable of raw text strings.

    Yields:
        The tokens returned by ``nltk.word_tokenize`` for each text, in
        input order.
    """
    for text in sentences:
        cleaned = get_valid_lines(text)
        yield nltk.word_tokenize(cleaned)


In [53]:
# Read the whitespace-separated stop-word list once. The file is opened
# read-only (no write access is needed) and is closed as soon as its content
# has been read.
with open('stop_words.txt', 'r', encoding='utf-8') as stopwords_file:
    all_stopwords = stopwords_file.read()
# str.split() without arguments already drops all surrounding whitespace, so
# the resulting words need no further stripping.
stopwords_ready = all_stopwords.split()
def remove_stopwords(content, stopwords=None, min_length=6):
    """Drop stop words and short words from one tokenized document.

    Args:
        content: Iterable of word strings.
        stopwords: Optional iterable of words to remove. When omitted, the
            module-level ``stopwords_ready`` list is used.
        min_length: Minimum number of characters a word needs in order to be
            kept. The default of 6 keeps exactly the words longer than five
            characters.

    Returns:
        list: The words that are at least ``min_length`` characters long and
        are not stop words, in their original order. ``content`` is not
        modified.
    """
    if stopwords is None:
        stopwords = stopwords_ready
    # A set gives constant-time membership tests instead of scanning the whole
    # stop-word list for every single word.
    blocked = set(stopwords)
    return [
        word
        for word in content
        if len(word) >= min_length and word not in blocked
    ]
def remove_stopwords_list(data_list):
    """Run ``remove_stopwords`` over every document in ``data_list``.

    Args:
        data_list: Iterable of tokenized documents.

    Returns:
        list: One filtered document per input document, in input order.
    """
    return [remove_stopwords(tokens) for tokens in data_list]

In [54]:
# RafiStemmer is provided by the ``b_parser`` module (presumably project-local;
# it must be importable from the notebook's working directory).
from b_parser import RafiStemmer
# A single shared stemmer instance, used by ``stemming_data``.
stemmer = RafiStemmer()
def stemming_data(content):
    """Stem every word of one tokenized document.

    Args:
        content: Iterable of word strings.

    Returns:
        list: A new list holding ``stemmer.stem_word(word)`` for each word,
        in input order.
    """
    # Build a fresh list instead of overwriting ``content`` item by item, so
    # the caller's un-stemmed tokens survive and calling this again on the
    # same input does not stem already-stemmed words.
    return [stemmer.stem_word(word) for word in content]
def stemming_data_list(data_list):
    """Stem every document of a tokenized corpus.

    Args:
        data_list: Iterable of tokenized documents (iterables of words).

    Returns:
        list: A new list with one stemmed document per input document, in
        input order. Neither ``data_list`` nor the documents inside it are
        modified, so re-running the calling notebook cell is idempotent.
    """
    # Each document is handed over as a copy so that the caller's data stays
    # untouched no matter how ``stemming_data`` produces its result.
    return [stemming_data(list(document)) for document in data_list]

In [73]:
def runLda(data_ready, num_topics = 10, iterations = 1000, alpha='auto',
           passes=30, chunksize=30, random_state=100):
    """Train a gensim LDA topic model on tokenized documents.

    Args:
        data_ready: List of documents, each a list of token strings.
        num_topics: Number of topics to extract.
        iterations: Value for ``LdaModel``'s ``iterations`` argument.
        alpha: Value for ``LdaModel``'s ``alpha`` argument.
        passes: Value for ``LdaModel``'s ``passes`` argument.
        chunksize: Value for ``LdaModel``'s ``chunksize`` argument.
        random_state: Seed handed to ``LdaModel``; keeping it fixed makes
            repeated runs on the same data comparable.

    Returns:
        gensim.models.ldamodel.LdaModel: The trained model.
    """
    # Create Dictionary: maps every distinct token to an integer id
    id2word = corpora.Dictionary(data_ready)

    # Create Corpus: Term Document Frequency (bag-of-words per document)
    corpus = [id2word.doc2bow(text) for text in data_ready]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics,
                                               random_state=random_state,
                                               update_every=1,
                                               chunksize=chunksize,
                                               passes=passes,
                                               alpha=alpha,
                                               iterations=iterations,
                                               per_word_topics=True)
    return lda_model

In [90]:
def ldaOutputProducer(lda_model, num_topics=20, num_words=40,
                      output_path=r'topic_dist.txt', force_ascii=True):
    """Write the top words of each topic, with their weights, to a JSON file.

    Args:
        lda_model: Trained model offering ``show_topics``.
        num_topics: Number of topics requested from ``show_topics``.
        num_words: Number of words requested per topic.
        output_path: Destination handed to ``DataFrame.to_json``.
        force_ascii: Passed to ``DataFrame.to_json``. The default True matches
            pandas' own default; pass False to store non-ASCII words
            unescaped.

    Returns:
        None. The result is the file written to ``output_path``; the shape of
        the exported frame is printed by ``convertToDataFrame``.
    """
    topics = lda_model.show_topics(num_topics=num_topics,
                                   num_words=num_words,
                                   formatted=False)
    # With formatted=False each topic is (topic_id, [(word, weight), ...]);
    # the topic id itself is not exported.
    output_json_list = [
        {
            "words": [pair[0] for pair in word_weights],
            "conts": [pair[1] for pair in word_weights],
        }
        for _topic_id, word_weights in topics
    ]
    out_df = convertToDataFrame(output_json_list)
    out_df.to_json(output_path, force_ascii=force_ascii)

In [57]:
# Root folder of the newspaper JSON dumps. Set the NEWSPAPER_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the original location.
DATA_DIR = os.environ.get('NEWSPAPER_DATA_DIR',
                          '/home/aljubaer/Desktop/1_spl_3/data/newspaper')
data_json_list = readData(DATA_DIR)

Total data read -> 812


In [58]:
# Tabular view of the raw records; convertToDataFrame also prints the shape.
data_df = convertToDataFrame(data_json_list)

DataFrame shape(812, 3)


In [59]:
# Keep only the 'content' field of every record.
data_list = convertToList(data_json_list, 'content')

In [60]:
# sent_to_words is a generator; list() materialises the tokens of all documents.
data_tokenized_list = list(sent_to_words(data_list))

In [69]:
# Filter every tokenized document with remove_stopwords.
data_without_stopwords_list = remove_stopwords_list(data_tokenized_list)

In [70]:
# Stem the remaining tokens of every document.
data_stemmed_list = stemming_data_list(data_without_stopwords_list)

In [74]:
# Train the LDA model on the stemmed documents (10 topics, iterations=1000).
lda_model = runLda(data_stemmed_list, num_topics = 10, iterations = 1000)

In [91]:
# Export the per-topic words and weights of the trained model to a JSON file.
ldaOutputProducer(lda_model)

DataFrame shape(10, 2)
