# Import

In [14]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.mixture import GaussianMixture
from smart_open import open
from stop_words import get_stop_words
from tqdm import tqdm

## Add configuration and utility module paths

In [15]:
# Register the project's config and util directories (in that order) so the
# `ALL` and `util` modules imported in the next cell can be resolved.
sys.path.extend(
    (
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    )
)

In [16]:
from ALL import config 
from util import *

## Set conditions

In [17]:
# Register tqdm's `progress_apply` on pandas objects, then cap how many
# columns/rows pandas prints when a DataFrame is displayed.
tqdm.pandas()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [None]:
# Storage helper used below to download the preprocessed inputs.
# NOTE(review): `S3Manager` is not defined in this notebook; presumably it is
# provided by `from util import *` — confirm.
s3 = S3Manager()

In [18]:
# Experiment selection, hard-coded for interactive use. The trailing comments
# record the command-line argument each value was presumably meant to come
# from when run as a script.
data_type, vectorize_type, transformer_model = (
    "AgNews",  # sys.argv[1]
    "sentenceBERT",  # sys.argv[2]
    "sentence-transformers/all-MiniLM-L6-v2",  # sys.argv[3]
)

In [22]:
# Settings for the selected vectorizer / transformer model.
_vectorizer_settings = config["vectorize"][vectorize_type][transformer_model]
vector_dims = _vectorizer_settings["dims"]
normalizations = _vectorizer_settings["normalization"]
vector_model_nums = _vectorizer_settings["max_model_num"]

# Settings for the GMM clustering runs.
_gmm_settings = config["clustering"]["gmm"]
model_nums = _gmm_settings["max_model_num"]
covariance_types = _gmm_settings["covariance_types"]

# Class count registered for this dataset's category.
topic_nums = config["data"][data_type_classifier(data_type)]["class_num"]

# Read data

In [13]:
# Fetch the preprocessed master table for the selected dataset.
# NOTE(review): the result is indexed with [0] when read, so `download`
# presumably returns a sequence of local paths — confirm against S3Manager.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [14]:
# Load the master table, using the CSV's first column as the DataFrame index.
df = pd.read_csv(df_path[0], index_col=0)

In [15]:
# Fetch the class-label CSV that accompanies the master table.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [16]:
# The `class` column of the master table as a NumPy array.
# NOTE(review): not referenced by any later cell visible in this notebook.
label = df["class"].to_numpy()

In [17]:
# Read the class-label CSV; every row becomes one list of strings.
# newline="" follows the csv module's guidance so that newlines embedded in
# quoted fields are handled by the csv reader rather than the file layer.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    # list() instead of an identity comprehension: the old loop variable was
    # named `label`, the same name as the module-level `label` array.
    class_labels = list(reader)

In [None]:
# Choose the storage prefix under which the clustering outputs live.
# Only the two vectorizers below are supported.
# NOTE(review): the doc2vec prefix has no trailing slash and points at
# "vector", unlike the sentenceBERT prefix; later code appends the vector
# dimension directly to this string — confirm the doc2vec path is intended.
if vectorize_type not in ("doc2vec", "sentenceBERT"):
    raise NotImplementedError

pred_path = (
    f"Clustering/{data_type}/{vectorize_type}/vector"
    if vectorize_type == "doc2vec"
    else f"Clustering/{data_type}/{vectorize_type}/{transformer_model}/GMM/pred/"
)

# Functions

In [23]:
class Corpus:
    """Iterable of bag-of-words vectors backed by a file or by token lists.

    Iterating yields ``self.dictionary.doc2bow(...)`` for each document.
    When ``path`` is set the documents are the lines of that file (lower-cased
    and split on whitespace); otherwise they are the entries of ``texts``.

    Args:
        path: Optional location of a text file with one document per line.
        texts: Optional iterable of token lists; also used to build the
            dictionary.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        # NOTE(review): the vocabulary is built from `texts` only. When just
        # `path` is given, the dictionary is presumably empty and every
        # yielded vector will be empty as well — confirm this is intended.
        self.dictionary = Dictionary(texts)

    def __iter__(self):
        """Yield one bag-of-words vector per document."""
        if self.path is not None:
            # Context manager so the handle is released once iteration ends
            # (or the generator is closed); it was previously never closed.
            with open(self.path) as lines:
                for line in lines:
                    # assume there's one document per line, tokens separated by whitespace
                    yield self.dictionary.doc2bow(line.lower().split())
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

# Calculate Coherence

In [24]:
# Split each document's `words_nonstop` string into tokens. Missing values
# (anything that is not a string, e.g. NaN from an empty CSV cell) become
# [""]; isinstance is used because an identity test against np.nan does not
# match every NaN object.
texts = df.words_nonstop.progress_apply(
    lambda x: x.split(" ") if isinstance(x, str) else [""]
).tolist()
corpus = Corpus(texts=texts)
# Reuse the dictionary the corpus already built instead of constructing a
# second one: this keeps the token ids produced by `corpus` in step with
# `dictionary` after filtering, and avoids scanning all documents twice.
dictionary = corpus.dictionary
# Prune the vocabulary with gensim's default thresholds. No arguments are
# passed, so the cut-offs are the library defaults (per the gensim docs:
# no_below=5, no_above=0.5, keep_n=100000), not the 80% the earlier comment
# mentioned.
dictionary.filter_extremes()

100%|██████████| 120000/120000 [00:00<00:00, 164420.18it/s]


In [25]:
# Compute the c_v coherence of every saved GMM prediction and write one CSV
# per (normalization, vector_dim): rows are model indices, columns are
# covariance types.
#
# NOTE(review): this cell previously read `normalization` and `n_component`
# without defining them anywhere in the notebook, so it raised NameError on a
# fresh kernel. They are now bound from the config values that were loaded
# but never used: `normalizations` (assumed to be an iterable of names) and
# `topic_nums` (assumed to be a single component count) — confirm both.
# NOTE(review): `root_path_temporary` and `make_filepath` are presumably
# provided by `from util import *`.
n_component = topic_nums
for vector_dim in tqdm(vector_dims):
    for normalization in normalizations:
        coherences = []
        coherence_path = f"../data/{data_type}/{vectorize_type}/GMM/coherence/{normalization}/{vector_dim}.csv"
        for model_num in range(model_nums):
            coherence = []
            for covariance_type in covariance_types:
                pred = np.load(
                    f"{root_path_temporary}{pred_path}{vector_dim}/{normalization}/{n_component}/{covariance_type}/{model_num}.npy"
                )

                # NOTE(review): gensim documents `topics` as a list of token
                # lists; here the loaded prediction array is passed as a
                # single "topic" — verify this is the intended input.
                cm = CoherenceModel(
                    topics=pred.reshape(1, -1),
                    corpus=corpus,
                    dictionary=dictionary,
                    texts=texts,
                    coherence="c_v",
                )
                coherence.append(cm.get_coherence())
            coherences.append(coherence)
            # Rewritten after every model so partial results survive an
            # interrupted run; the final file contains all models.
            pd.DataFrame(coherences, columns=covariance_types).to_csv(make_filepath(coherence_path))

100%|██████████| 1/1 [02:57<00:00, 177.45s/it]


In [26]:
# Send a completion message naming the dataset and vectorizer that were run.
# NOTE(review): `send_line_notify` is presumably provided by `from util import *`.
send_line_notify(f"end {data_type} {vectorize_type}")