# Imports

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from smart_open import open
from tqdm import tqdm

## Add the configuration directory to the import path

In [2]:
sys.path.append("/home/jovyan/core/config/")

In [3]:
from ALL import config 

## Set progress-bar and display options

In [4]:
# Limit how much of a DataFrame pandas renders in notebook output.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 100)

# Register tqdm's `progress_apply` / `progress_map` methods on pandas objects.
tqdm.pandas()

# Functions

In [5]:
class Corpus:
    """Re-iterable bag-of-words corpus backed by a text file or in-memory texts.

    Every iteration yields one document converted with
    ``self.dictionary.doc2bow``.

    Args:
        path: Optional path to a text file holding one document per line with
            whitespace-separated tokens. When given, iteration reads from this
            file and lower-cases each line before splitting it.
        texts: Optional pre-tokenised documents (an iterable of token lists).
            Iterated only when ``path`` is None.

    The dictionary is built from ``texts`` when they are provided; when only
    ``path`` is provided it is built from the tokenised lines of that file.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        if texts is None and path is not None:
            # Building from ``texts=None`` would leave the dictionary empty, so
            # a file-only corpus would yield nothing but empty bags-of-words.
            self.dictionary = Dictionary(self._iter_file_tokens())
        else:
            self.dictionary = Dictionary(texts)

    def _iter_file_tokens(self):
        """Yield the lower-cased, whitespace-split tokens of each line of ``self.path``."""
        # The context manager closes the file once the generator is exhausted
        # or discarded, instead of leaving the handle open.
        with open(self.path) as stream:
            for line in stream:
                # assume there's one document per line, tokens separated by whitespace
                yield line.lower().split()

    def __iter__(self):
        if self.path is not None:
            for tokens in self._iter_file_tokens():
                yield self.dictionary.doc2bow(tokens)
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

# Read data

In [6]:
# Path of each dataset's master.csv, keyed by dataset name.
master_path = {
    dataset: f"../../Preprocessing/data/{dataset}/master.csv"
    for dataset in ("AgNews", "20News")
}

In [7]:
# Path of each dataset's class.csv, keyed by dataset name.
label_path = {
    dataset: f"../../Preprocessing/data/{dataset}/class.csv"
    for dataset in ("AgNews", "20News")
}

In [19]:
# For every dataset: load its master table, build a corpus and dictionary from
# the `words_nonstop` column, compute a c_v coherence score and write it to
# ../data/<dataset>/coherence.csv.
for df_name, df_path in master_path.items():
    # Load the data.
    df = pd.read_csv(df_path, index_col=0)
    label = df["class"].to_numpy()

    # Build the corpus; each `words_nonstop` entry is split on single spaces
    # into a token list.
    texts = df.words_nonstop.progress_apply(lambda x: x.split(" ")).tolist()
    corpus = Corpus(texts=texts)
    # Reuse the dictionary the corpus already built instead of constructing a
    # second one: this avoids another pass over `texts` and keeps the token
    # ids produced by `corpus` consistent with the filtered dictionary.
    dictionary = corpus.dictionary
    dictionary.filter_extremes()

    # Compute the coherence.
    # NOTE(review): the class labels are passed as one single "topic", whereas
    # gensim documents `topics` as lists of tokens per topic -- confirm that
    # this is the intended input.
    cm = CoherenceModel(
        topics=label.reshape(1, -1),
        corpus=corpus,
        dictionary=dictionary,
        texts=texts,
        coherence="c_v",
    )
    coherence_path = f"../data/{df_name}/coherence.csv"
    os.makedirs(os.path.dirname(coherence_path), exist_ok=True)
    pd.DataFrame([cm.get_coherence()]).to_csv(coherence_path)

100%|██████████| 120000/120000 [00:00<00:00, 132418.95it/s]
100%|██████████| 18770/18770 [00:00<00:00, 63078.15it/s]
