# Import

In [2]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.mixture import GaussianMixture
from smart_open import open
from stop_words import get_stop_words
from tqdm import tqdm

## Add configuration file

In [3]:
# Extend the module search path with the config directory; presumably this is
# what lets the `from ALL import config` import in the next cell resolve.
sys.path.append("/home/jovyan/core/config/")

In [4]:
from ALL import config 

## Configure tqdm and pandas display options

In [5]:
# Show up to 100 columns and 50 rows when a DataFrame is displayed.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50
# Register tqdm's pandas integration (provides `progress_apply`).
tqdm.pandas()

# Read data

In [6]:
# Load the 20News master table; the first CSV column becomes the index.
newsgroups_df = pd.read_csv("../../../Preprocessing/data/20News/master.csv", index_col=0)

In [7]:
# Read the class-label file; every CSV row becomes one list of strings.
# newline="" follows the csv module's recommendation so that quoted fields
# containing line breaks are parsed correctly.
with open("../../../Preprocessing/data/20News/class.csv", mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [8]:
# Settings read from the imported config mapping.
# `vector_dims` is not referenced by any of the cells visible in this notebook.
vector_dims = config["vectorize"]["doc2vec"]["dims"]
# Used below as the number of per-model prediction files to score.
model_nums = config["clustering"]["LDA"]["max_model_num"]

In [9]:
class Corpus:
    """Re-iterable stream of bag-of-words vectors.

    Documents come either from a text file (`path`, one document per line,
    tokens separated by whitespace) or from an in-memory list of token lists
    (`texts`). When `path` is set it takes precedence during iteration.

    Attributes:
        path: Location of the text file, or None.
        texts: Iterable of token lists, or None.
        dictionary: gensim `Dictionary` used to convert tokens to ids.
    """

    def __init__(self, path=None, texts=None):
        """Store the document source and build the token dictionary.

        Args:
            path: Optional path/URI of a file with one document per line.
            texts: Optional iterable of already-tokenised documents.
        """
        self.path = path
        self.texts = texts
        if texts is None and path is not None:
            # Build the vocabulary from the file itself; otherwise the
            # dictionary would be empty and every document read from `path`
            # would be converted to an empty bag-of-words.
            self.dictionary = Dictionary(self._iter_path_tokens())
        else:
            self.dictionary = Dictionary(texts)

    def _iter_path_tokens(self):
        """Yield the lower-cased, whitespace-split tokens of each line of `self.path`."""
        # The context manager closes the file once the stream is exhausted
        # (or the generator is closed early) instead of leaking the handle.
        with open(self.path) as f:
            for line in f:
                # assume there's one document per line, tokens separated by whitespace
                yield line.lower().split()

    def __iter__(self):
        """Yield one bag-of-words vector per document."""
        if self.path is not None:
            for tokens in self._iter_path_tokens():
                yield self.dictionary.doc2bow(tokens)
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

In [10]:
# Split every entry of the `words_nonstop` column on single spaces, showing a
# progress bar, and collect the token lists.
texts = newsgroups_df.words_nonstop.progress_apply(lambda x: x.split(' ')).tolist()
corpus = Corpus(texts=texts)
dictionary = Dictionary(texts)
# Prune very rare and very frequent tokens from the vocabulary.
# NOTE(review): the original comment said "remove words that appear in 80% or
# more of the documents", but no arguments are passed, so gensim's documented
# defaults apply (no_below=5, no_above=0.5, keep_n=100000). Confirm which
# threshold is intended before changing the call.
dictionary.filter_extremes()
# filter_extremes() shrinks gaps in the token ids, so the unfiltered dictionary
# built inside Corpus no longer agrees with `dictionary`. Share the filtered
# one so the corpus and the dictionary handed to CoherenceModel use the same
# id space.
corpus.dictionary = dictionary

100%|██████████| 18770/18770 [00:00<00:00, 61056.86it/s]


In [11]:
# Values of the `class` column as a NumPy array (not referenced by the cells
# visible below).
label = newsgroups_df["class"].to_numpy()

In [12]:
# Directory with one prediction CSV per model number (0.csv, 1.csv, ...).
pred_path = "../../../Clustering/data/20News/LDA/pred/"

# One c_v coherence score per model, in model-number order.
coherence = []
for model_num in tqdm(range(model_nums)):
    pred = np.loadtxt(f"{pred_path}{model_num}.csv", delimiter=",")

    # The loaded values are flattened into a single row and passed as the one
    # and only "topic".
    # NOTE(review): CoherenceModel expects topics to be lists of tokens or
    # token ids; confirm that these prediction files really contain such
    # values rather than per-document cluster assignments.
    cm = CoherenceModel(
        coherence="c_v",
        texts=texts,
        dictionary=dictionary,
        corpus=corpus,
        topics=pred.reshape(1, -1),
    )
    coherence += [cm.get_coherence()]

100%|██████████| 30/30 [01:06<00:00,  2.22s/it]


In [13]:
# Destination file for the per-model coherence scores.
coherence_path = "../../data/20News/LDA/coherence.csv"
# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(coherence_path), exist_ok=True)
# Write the scores as a single-column table, one row per model.
pd.DataFrame(coherence).to_csv(coherence_path)