# Import

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from smart_open import open
from tqdm import tqdm

  from imp import reload
  from scipy.linalg.special_matrices import triu


## Add configuration file

In [2]:
# Make the shared configuration package importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from an environment variable or a path relative to the project root.
_config_dir = "/home/jovyan/core/config/"
if _config_dir not in sys.path:
    sys.path.append(_config_dir)

In [3]:
from ALL import config 

## Set display and progress-bar options

In [4]:
# Enable tqdm's `progress_apply` on pandas objects, then cap how many
# columns/rows pandas renders when a frame is displayed.
tqdm.pandas()
for _option, _value in (("display.max_columns", 100), ("display.max_rows", 50)):
    pd.set_option(_option, _value)

# Read data

In [5]:
# Load the AgNews master table; the first CSV column is used as the index.
newsgroups_df = pd.read_csv(
    "../../../Preprocessing/data/AgNews/master.csv",
    index_col=0,
)

In [6]:
# Read the class-label file: every CSV row becomes one list of strings.
# newline="" is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly on every platform.
with open("../../../Preprocessing/data/AgNews/class.csv", mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [7]:
# Number of LDA models to train, read from the `config` mapping imported from
# `ALL`; the training loop below iterates over range(model_nums).
model_nums = config["clustering"]["LDA"]["max_model_num"]

In [8]:
class Corpus:
    """Re-iterable bag-of-words corpus backed by a text file or by token lists.

    Parameters
    ----------
    path : str, optional
        File with one document per line, tokens separated by whitespace.
        When given, iteration (and the length) is taken from this file.
    texts : list of list of str, optional
        Already tokenised documents, used when ``path`` is None.

    The vocabulary (``self.dictionary``) is built from ``texts`` when they are
    supplied; otherwise it is built from the file at ``path``. In both cases
    ``filter_extremes()`` is applied with its default thresholds.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        if texts is None and path is not None:
            # Without token lists the vocabulary has to come from the file;
            # Dictionary(None) would be empty and every doc2bow result with it.
            self.dictionary = Dictionary(self._iter_file_tokens())
        else:
            self.dictionary = Dictionary(texts)
        self.dictionary.filter_extremes()

    def _iter_file_tokens(self):
        """Yield the lower-cased, whitespace-split tokens of each line of ``path``."""
        # The context manager closes the handle even if iteration stops early.
        with open(self.path) as f:
            for line in f:
                # assume there's one document per line, tokens separated by whitespace
                yield line.lower().split()

    def __iter__(self):
        """Yield one bag-of-words (result of ``dictionary.doc2bow``) per document."""
        if self.path is not None:
            for tokens in self._iter_file_tokens():
                yield self.dictionary.doc2bow(tokens)
        else:
            # A corpus created without any source is treated as empty.
            for tokens in (self.texts if self.texts is not None else ()):
                yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents that ``__iter__`` will yield."""
        if self.path is not None:
            # Mirror __iter__: the file is the source of documents.
            return sum(1 for _ in self._iter_file_tokens())
        return len(self.texts) if self.texts is not None else 0

In [9]:
# Split each space-separated `words_nonstop` string into a token list.
# NOTE(review): assumes every `words_nonstop` value is a string — a NaN row
# would fail on `.split`; confirm against the preprocessing output.
texts = newsgroups_df.words_nonstop.progress_apply(lambda x: x.split(' ')).tolist()
corpus = Corpus(texts=texts)
# Corpus already builds a Dictionary from `texts` and calls filter_extremes()
# on it; reusing that object avoids a second pass over all documents and
# guarantees that the ids in the corpus and in `dictionary` are the same.
dictionary = corpus.dictionary

100%|██████████| 120000/120000 [00:00<00:00, 176129.32it/s]


In [10]:
# Persist the vocabulary and the corpus next to the LDA artefacts.
os.makedirs(os.path.dirname("../../data/AgNews/LDA/"), exist_ok=True)
# Context managers make sure each file is flushed and closed once written;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open("../../data/AgNews/LDA/dictionary.sav", "wb") as f:
    pickle.dump(dictionary, f)
with open("../../data/AgNews/LDA/corpus.sav", "wb") as f:
    pickle.dump(corpus, f)

In [11]:
# The `class` column of the master table as a NumPy array.
# NOTE(review): `label` is not referenced by any other cell visible here.
label = newsgroups_df["class"].to_numpy()

In [12]:
def getLDA(corpus, dictionary, n_components, seed, path):
    """Train an LDA model, save it to ``path`` and infer topics for each document.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words documents; iterated for training and again for inference,
        so it must be re-iterable.
    dictionary
        Passed to ``LdaModel`` as ``id2word``.
    n_components : int
        Number of topics (``num_topics``).
    seed : int
        Value for ``random_state``.
    path : str
        Destination of the saved model; missing parent directories are created.

    Returns
    -------
    tuple
        ``(pred, lda)`` where ``pred`` holds ``lda[bow]`` for every document in
        ``corpus`` (consumed by the caller as (topic id, probability) pairs)
        and ``lda`` is the trained model.
    """
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_components,
        alpha="auto",
        eval_every=5,
        random_state=seed,
    )
    # save model
    model_dir = os.path.dirname(path)
    # A bare filename has no directory part and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    lda.save(path)
    pred = [lda[doc_bow] for doc_bow in corpus]
    return pred, lda

In [13]:
# Number of topics handed to every LDA model.
# NOTE: the name is a misspelling of `n_components`; it is kept as-is because
# the training cell refers to it under this exact name.
n_conmponents = 4

In [14]:
models_path = "../../data/AgNews/LDA/model/"
pred_path = "../../data/AgNews/LDA/pred/"

# The prediction directory is the same for every model, so create it once
# instead of on every iteration.
os.makedirs(pred_path, exist_ok=True)

for model_num in tqdm(range(model_nums)):
    # One model per iteration; the loop index is both the random seed and the
    # file name of the saved model / prediction.
    prob, lda = getLDA(
        corpus=corpus,
        dictionary=dictionary,
        n_components=n_conmponents,
        seed=model_num,
        path=f"{models_path}{model_num}",
    )
    # save prediction
    # Each row of `prob` becomes a {topic: probability} mapping; topics absent
    # from a row are filled with 0 before taking the most probable topic.
    probDf = pd.DataFrame([dict(row) for row in prob]).fillna(0)
    pred = probDf.idxmax(axis=1).to_numpy()
    # newline="" is required by the csv module so the writer controls line
    # endings itself (otherwise extra carriage returns appear on Windows).
    with open(f"{pred_path}{model_num}.csv", "w", newline="") as f:
        writer = csv.writer(f)
        # All predictions of one model are written as a single CSV row.
        writer.writerow(pred)

100%|██████████| 2/2 [02:25<00:00, 72.92s/it]


In [15]:
# Render the interactive pyLDAvis view inline in the notebook.
# `lda` is whatever the training loop bound last, i.e. the final model trained.
pyLDAvis.enable_notebook()

# sort_topics=False is passed so pyLDAvis does not reorder the topics.
vis = gensimvis.prepare(lda, corpus, dictionary, n_jobs=1, sort_topics=False)

pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
