# Import

In [30]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from smart_open import open
from tqdm import tqdm

## Add configuration file

In [31]:
# Extend the module search path with the project's config and util directories
# so the `ALL` and `util` imports in the next cell can resolve (presumably
# those modules live in these directories — verify on the target machine).
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [32]:
from ALL import config 
from util import *

## Set conditions

In [33]:
# Register tqdm's `progress_apply` on pandas objects (used when tokenising
# below) and raise the display limits for wide/long DataFrames.
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [34]:
# Storage helper used below to download inputs, upload results and clean up
# local files. S3Manager is not defined in this file; presumably it comes
# from the `util` star import.
s3 = S3Manager()

In [35]:
# Dataset name; it is interpolated into every input and output path below.
data_type = "TweetTopic"

In [36]:
# Number of LDA models to train; the training loop runs range(model_nums)
# and uses each index as that model's random seed.
model_nums = config["clustering"]["LDA"]["max_model_num"]

In [37]:
# Class-count setting for this dataset. The training loop passes its first
# element (n_components[0]) as the number of LDA topics, so this is
# presumably a list — confirm against the config file.
n_components = config["data"][data_type_classifier(data_type)]["class_num"]

# Read data

In [38]:
# Fetch the preprocessed master table for this dataset. The result is indexed
# with [0] when read, so it is presumably a sequence of local file paths.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [39]:
# Load the master table, using the first CSV column as the DataFrame index.
df = pd.read_csv(df_path[0], index_col=0)

In [40]:
# Fetch the class-label CSV for this dataset; like df_path, the result is
# indexed with [0] when it is opened.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [41]:
# Read the class-label CSV into a list of rows; each row is the list of
# string fields produced by csv.reader for one line of the file.
with open(labels_path[0], mode="r") as labels_file:
    class_labels = list(csv.reader(labels_file))

# LDA

In [42]:
class Corpus:
    """Re-iterable bag-of-words corpus backed by a text file or token lists.

    Each iteration yields one ``dictionary.doc2bow(...)`` result per document,
    so the object can be passed to gensim models that iterate their corpus
    more than once.

    Args:
        path: Optional path to a text file with one document per line and
            whitespace-separated tokens. When given, iteration and ``len()``
            read from this file (lines are lower-cased before splitting).
        texts: Optional in-memory documents, each a list of string tokens.
            When given, the vocabulary is built from these documents.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        if texts is None and path is not None:
            # Without in-memory documents the vocabulary has to come from the
            # file itself; building it from ``None`` would give an empty
            # dictionary and therefore an empty bag-of-words for every line.
            with open(path) as handle:
                self.dictionary = Dictionary(
                    line.lower().split() for line in handle
                )
        else:
            self.dictionary = Dictionary(texts)
        # Prune very rare / very common tokens using gensim's default thresholds.
        self.dictionary.filter_extremes()

    def __iter__(self):
        if self.path is not None:
            # The context manager closes the file once the generator is
            # exhausted or discarded.
            with open(self.path) as handle:
                for line in handle:
                    # assume there's one document per line, tokens separated by whitespace
                    yield self.dictionary.doc2bow(line.lower().split())
        else:
            # ``texts`` may be None when neither source was supplied; treat
            # that as an empty corpus.
            for tokens in self.texts or ():
                yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        # Mirror ``__iter__``: the reported length must match the number of
        # documents actually yielded.
        if self.path is not None:
            with open(self.path) as handle:
                return sum(1 for _ in handle)
        if self.texts is not None:
            return len(self.texts)
        return 0

In [43]:
# Tokenise the pre-cleaned text column on single spaces. Anything that is not
# a string (i.e. a missing value) becomes a one-token placeholder document.
# A type check is used because `x is not np.nan` only matches one specific NaN
# object and would let any other NaN instance reach `.split`.
texts = df.words_nonstop.progress_apply(
    lambda x: x.split(' ') if isinstance(x, str) else [""]
).tolist()
corpus = Corpus(texts=texts)
# Corpus already builds and filters a Dictionary from these same texts; reuse
# it so the ids produced by doc2bow and the id2word mapping handed to LDA are
# guaranteed to be the same vocabulary (and the vocabulary is built only once).
dictionary = corpus.dictionary

100%|██████████| 6997/6997 [00:00<00:00, 281924.98it/s]


In [44]:
# Persist the vocabulary and corpus next to the other temporary LDA artefacts.
# NOTE(review): this directory ({root}{data_type}/LDA/) is not under the
# Clustering/{data_type}/LDA tree that is uploaded later — confirm whether
# these two files are meant to be uploaded as well.
lda_dir = f"{root_path_temporary}{data_type}/LDA/"
os.makedirs(lda_dir, exist_ok=True)
# Use context managers so both files are flushed and closed deterministically.
with open(f"{lda_dir}dictionary.sav", "wb") as f:
    pickle.dump(dictionary, f)
with open(f"{lda_dir}corpus.sav", "wb") as f:
    pickle.dump(corpus, f)

In [45]:
# "class" column of the master table as a NumPy array. It is not referenced
# again in the visible part of this notebook.
label = df["class"].to_numpy()

In [46]:
def getLDA(corpus,dictionary, n_components, seed, path):
    """Train one LDA model, save it, and return the per-document topics.

    Args:
        corpus: Iterable of bag-of-words documents; it is iterated for
            training and again to compute the per-document output.
        dictionary: id-to-token mapping passed to gensim as ``id2word``.
        n_components: Number of topics to fit.
        seed: Value passed as ``random_state`` for reproducibility.
        path: File path the trained model is saved to. Its parent directory
            is created if it does not exist.

    Returns:
        A ``(pred, lda)`` tuple: ``pred`` holds ``lda[bow]`` for every
        document in ``corpus`` (the caller converts each entry with
        ``dict(...)``, i.e. a sequence of (topic id, probability) pairs), and
        ``lda`` is the trained ``LdaModel``.
    """
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_components,
        alpha="auto",
        eval_every=5,
        random_state=seed,
    )
    # save model
    model_dir = os.path.dirname(path)
    if model_dir:
        # A bare file name has no directory part, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        os.makedirs(model_dir, exist_ok=True)
    lda.save(path)
    pred = [lda[doc_bow] for doc_bow in corpus]
    return pred, lda

In [47]:
models_path = f"{root_path_temporary}Clustering/{data_type}/LDA/model/"
pred_path = f"{root_path_temporary}Clustering/{data_type}/LDA/pred/"
prob_path = f"{root_path_temporary}Clustering/{data_type}/LDA/prob/"

# The output directories do not depend on the model index; create them once.
os.makedirs(prob_path, exist_ok=True)
os.makedirs(pred_path, exist_ok=True)

# Train one model per seed and store, for each: the model itself (saved inside
# getLDA), the document-topic probability table, and the hard assignment.
for model_num in tqdm(range(model_nums)):
    prob, lda = getLDA(
        corpus=corpus,
        dictionary=dictionary,
        n_components=n_components[0],
        seed=model_num,
        path=f"{models_path}{model_num}/{model_num}"
    )
    # save prediction
    # Building a frame from per-document dicts orders the columns by the first
    # document in which each topic appears and omits topics that never show
    # up. Reindex to 0..n_topics-1 so every CSV has the same, complete,
    # ordered set of topic columns; absent entries become probability 0.
    probDf = (
        pd.DataFrame([dict(row) for row in prob])
        .reindex(columns=range(n_components[0]))
        .fillna(0)
    )
    probDf.to_csv(f"{prob_path}{model_num}.csv")

    # Hard cluster assignment: the topic id with the highest probability.
    pred = probDf.idxmax(axis=1).to_numpy()
    np.save(f"{pred_path}{model_num}.npy", pred)

100%|██████████| 20/20 [01:35<00:00,  4.80s/it]


## Upload files

In [48]:
# Upload the local Clustering/<data_type>/LDA tree, which is where the
# training loop wrote its model/, prob/ and pred/ outputs.
s3.upload(
    f"{root_path_temporary}Clustering/{data_type}/LDA", 
)

'/home/jovyan/temporary/Clustering/TweetTopic/LDA'

In [49]:
# Local clean-up via the storage helper. The recorded output lists the
# uploaded LDA directory and the two downloaded CSVs, so this presumably
# removes every local path the manager touched — confirm in S3Manager.
s3.delete_local_all()

/home/jovyan/temporary/Clustering/TweetTopic/LDA
/home/jovyan/temporary/Preprocessing/TweetTopic/master.csv
/home/jovyan/temporary/Preprocessing/TweetTopic/class.csv


In [50]:
# Send a completion message naming this script and dataset. send_line_notify
# is not defined in this file (presumably provided by the `util` star import).
send_line_notify(f"LDA.py {data_type} LDA")