# Import

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from smart_open import open
from tqdm import tqdm

  from imp import reload
  from scipy.linalg.special_matrices import triu


## Add configuration file

In [2]:
# Directory added to the import path (presumably where the `ALL` module
# imported in the next cell lives -- confirm). The default keeps the original
# container path; set CORE_CONFIG_DIR to run the notebook elsewhere.
CONFIG_DIR = os.environ.get("CORE_CONFIG_DIR", "/home/jovyan/core/config/")
# Guard the append so re-running this cell does not stack duplicate entries.
if CONFIG_DIR not in sys.path:
    sys.path.append(CONFIG_DIR)

In [3]:
from ALL import config 

## Set condition

In [4]:
# Register tqdm's pandas integration (`progress_apply`).
tqdm.pandas()

# Cap how many columns / rows of a DataFrame are rendered in cell outputs.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [5]:
# Dataset key; it is interpolated into every input and output path below.
data_type = "AgNews"

# Read data

In [6]:
# Load the preprocessed master table; the first CSV column becomes the index.
df = pd.read_csv(
    filepath_or_buffer=f"../../Preprocessing/data/{data_type}/master.csv",
    index_col=0,
)

In [7]:
# Read the class file; every CSV row is kept as a list of its fields, so
# `class_labels` is a list of lists.
with open(f"../../Preprocessing/data/{data_type}/class.csv", mode="r") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [8]:
# Number of LDA models to fit, read from the project config; the training
# loop below iterates over range(model_nums).
model_nums = config["clustering"]["LDA"]["max_model_num"]

In [9]:
# Topic count for LDA, taken from the configured class count of this dataset.
# NOTE: the name is misspelled ("conmponents"); it is kept as-is because the
# training loop further down refers to it by this exact name.
n_conmponents = config["data"][data_type]["class_num"]

# LDA

In [10]:
class Corpus:
    """Re-iterable bag-of-words corpus backed by a text file or token lists.

    Exactly one source is normally supplied:

    * ``texts`` -- an in-memory sequence of token lists, or
    * ``path``  -- a text file with one document per line, tokens separated
      by whitespace (lines are lower-cased before splitting).

    When both are given, the dictionary and ``len()`` come from ``texts``
    while iteration reads ``path``.

    Attributes:
        path: File path given at construction (or None).
        texts: Token lists given at construction (or None).
        dictionary: gensim ``Dictionary`` built from the source and pruned
            with ``filter_extremes()`` using its default thresholds.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        # With only a path, build the vocabulary from the file. Previously the
        # dictionary was built from ``texts`` (None) and stayed empty, so every
        # document streamed from the file became an empty bag-of-words.
        if texts is None and path is not None:
            self.dictionary = Dictionary(self._tokens_from_path())
        else:
            self.dictionary = Dictionary(texts)
        self.dictionary.filter_extremes()

    def _tokens_from_path(self):
        """Yield one lower-cased, whitespace-split token list per line of ``self.path``."""
        # The context manager closes the file once the generator is exhausted
        # or discarded; the bare ``open`` used before leaked the handle.
        with open(self.path) as handle:
            for line in handle:
                yield line.lower().split()

    def __iter__(self):
        """Yield the bag-of-words (``dictionary.doc2bow``) of each document."""
        if self.path is not None:
            documents = self._tokens_from_path()
        elif self.texts is not None:
            documents = self.texts
        else:
            # No source at all: behave as an empty corpus.
            documents = ()
        for tokens in documents:
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents."""
        if self.texts is not None:
            return len(self.texts)
        if self.path is not None:
            # Path-only corpus: count lines rather than failing on len(None).
            return sum(1 for _ in self._tokens_from_path())
        return 0

In [11]:
# Tokenise each document on single spaces. Missing values are detected with
# pd.notna instead of `x is not np.nan`: the identity test only recognises
# the np.nan singleton and would call .split on any other NaN object.
# Missing documents keep the original placeholder token list [""].
texts = df.words_nonstop.progress_apply(
    lambda x: x.split(" ") if pd.notna(x) else [""]
).tolist()
corpus = Corpus(texts=texts)
# Corpus already builds a Dictionary from `texts` and applies
# filter_extremes(); reuse it instead of repeating the same work.
dictionary = corpus.dictionary

100%|██████████| 120000/120000 [00:00<00:00, 193205.10it/s]


In [12]:
# Persist the vocabulary and the corpus to disk. Each file is opened in a
# `with` block so the handle is flushed and closed right after the dump;
# the inline open(...) used before never closed them.
lda_dir = f"../data/{data_type}/LDA/"
os.makedirs(lda_dir, exist_ok=True)
with open(f"{lda_dir}dictionary.sav", "wb") as dictionary_file:
    pickle.dump(dictionary, dictionary_file)
with open(f"{lda_dir}corpus.sav", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)

In [13]:
# Values of the `class` column as a NumPy array.
label = df["class"].to_numpy()

In [14]:
def getLDA(corpus, dictionary, n_components, seed, path):
    """Train an LDA model, save it to ``path`` and infer topics per document.

    Args:
        corpus: Iterable of bag-of-words documents. It is consumed by the
            model for training and iterated again for inference, so it must
            be re-iterable (not a one-shot generator).
        dictionary: Passed to ``LdaModel`` as ``id2word``.
        n_components: Number of topics (``num_topics``).
        seed: Value for ``random_state``.
        path: File path the trained model is saved to. Its parent directory
            is created if it does not exist.

    Returns:
        Tuple ``(pred, lda)``: ``pred`` is a list holding ``lda[bow]`` for
        every document of ``corpus`` in iteration order, and ``lda`` is the
        trained model.
    """
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_components,
        alpha="auto",
        eval_every=5,
        random_state=seed,
    )
    # Save the model. os.makedirs("") raises FileNotFoundError, so the parent
    # directory is only created when `path` actually has a directory part.
    model_dir = os.path.dirname(path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    lda.save(path)
    pred = [lda[doc_bow] for doc_bow in corpus]
    return pred, lda

In [15]:
models_path = f"../data/{data_type}/LDA/model/"
pred_path = f"../data/{data_type}/LDA/pred/"

# The prediction directory does not depend on the loop variable, so create
# it once up front instead of on every iteration.
os.makedirs(pred_path, exist_ok=True)

# Fit one model per seed; the model index doubles as the random seed and as
# the file name of the saved model and predictions.
for model_num in tqdm(range(model_nums)):
    prob, lda = getLDA(
        corpus=corpus,
        dictionary=dictionary,
        n_components=n_conmponents,
        seed=model_num,
        path=f"{models_path}{model_num}",
    )
    # One row per document, one column per topic id that appears in the
    # results; topics missing from a document's result are filled with 0.
    probDf = pd.DataFrame([dict(row) for row in prob]).fillna(0)
    # Hard assignment: the topic id with the largest value in each row.
    pred = probDf.idxmax(axis=1).to_numpy()
    np.save(f"{pred_path}{model_num}.npy", pred)

100%|██████████| 30/30 [37:30<00:00, 75.00s/it]


In [16]:
# Switch pyLDAvis to notebook rendering.
pyLDAvis.enable_notebook()

# `lda` is whichever model the training loop assigned last; the loop variable
# is still bound at this point.
vis = gensimvis.prepare(
    lda,
    corpus,
    dictionary,
    n_jobs=1,
    sort_topics=False,
)

pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [17]:
# Rows whose stop-word-filtered text contains the substring "It"
# (case-sensitive, so it also matches e.g. "Italy"). na=False makes missing
# text count as "no match"; without it the mask contains NaN and boolean
# indexing raises as soon as one document has no text.
df[df.words_nonstop.str.contains("It", na=False)]

Unnamed: 0,class,title,text,words,words_nonstop
174,SciTech,Mozilla Exceptions (mexception),\\For some reason I never released this code.\...,\\For some reason I never released this code.\...,\\For reason never released code.\\I developed...
181,SciTech,What would Baby Jesus Think?,"\\""On Tuesday, Cheney, serving in his role as ...","\\ '' On Tuesday , Cheney , serving in his rol...",\\ Tuesday Cheney serving role president Senat...
454,Sports,Italy's Pennetta Wins Idea Prokom Open (AP),AP - Italy's Flavia Pennetta won the Idea Prok...,AP - Italy 's Flavia Pennetta won the Idea Pro...,AP Italy Flavia Pennetta Idea Prokom Open firs...
622,World,Italy on alert after purported Al-Qaeda ultima...,AFP - Italy was on high alert as a group linke...,AFP - Italy was on high alert as a group linke...,AFP Italy high alert group linked Al-Qaeda rep...
874,SciTech,HP Faces New Realities in a 64-Bit World (Ziff...,Ziff Davis - The company this week will unveil...,Ziff Davis - The company this week will unveil...,Ziff Davis company week unveil programs techno...
...,...,...,...,...,...
119502,Sports,Austrians Sweep Podium in Val Gardena Super G,"Val Gardena, Italy (Friday, December 17, 2004)...","Val Gardena , Italy ( Friday , December 17 , 2...",Val Gardena Italy Friday December 17 2004 Mich...
119684,SciTech,Intel hires HP Itanium processor design team,Intel is to hire HP #39;s 300-member Itanium p...,Intel is to hire HP # 39 ; s 300-member Itaniu...,Intel hire HP 39 300-member Itanium processor ...
119701,Sports,Suns Tops in West,"Using what he learned from Italy's pro league,...",Using what he learned from Italy 's pro league...,Using learned Italy pro league Mike D'Antoni i...
119849,Sports,Miller fourth in Italian super-G race,"VAL GARDENA, Italy - Bode Miller finished four...","VAL GARDENA , Italy - Bode Miller finished fou...",VAL GARDENA Italy Bode Miller finished fourth ...
