# Import

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from itertools import product
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.mixture import GaussianMixture
from smart_open import open
from tqdm import tqdm

## Add configuration file

In [2]:
# Make the project's config and util packages importable from this notebook.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

## Set conditions

In [4]:
# Register tqdm's pandas integration (progress_apply is used further down)
# and cap how many columns / rows pandas renders for a frame.
tqdm.pandas()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [5]:
# Shared S3 helper; later cells call its download / upload / delete_local_all methods.
# S3Manager is presumably provided by the `from util import *` star import — confirm.
s3 = S3Manager()

In [6]:
# Run parameters, hard-coded here; the trailing notes record the sys.argv slot
# that each value was previously associated with.
data_type, vectorize_type, transformer_model = (
    "20NewsSampled1",  # sys.argv[1]
    "sentenceBERT",  # sys.argv[2]
    "sentence-transformers/all-MiniLM-L6-v2",  # sys.argv[3]
)

In [7]:
# Settings of the selected vectorizer / transformer model.
vectorize_config = config["vectorize"][vectorize_type][transformer_model]
vector_dims = vectorize_config["dims"]
normalizations = vectorize_config["normalization"]
vector_model_nums = vectorize_config["max_model_num"]

# Settings of the GMM clustering stage.
gmm_config = config["clustering"]["gmm"]
model_nums = gmm_config["max_model_num"]
covariance_types = gmm_config["covariance_types"]

# Candidate topic counts are looked up through the dataset family returned by data_type_classifier.
topic_nums = config["data"][data_type_classifier(data_type)]["class_num"]
# NOTE(review): depression_type is not referenced in the visible cells — confirm it is still needed.
depression_type = "umap"

# Read data

In [8]:
# Fetch the preprocessed master table for this dataset through the S3 helper.
# The result is indexed with [0] when it is read, so download() appears to return a sequence of local paths.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [9]:
# Load the master table, using the first CSV column as the index.
df = pd.read_csv(df_path[0], index_col=0)

In [10]:
# Fetch class.csv for this dataset; it is parsed with csv.reader into class_labels further down.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [11]:
# Per-row values of the "class" column as a NumPy array.
# NOTE(review): this module-level `label` is not referenced again in the visible cells.
label = df["class"].to_numpy()

In [12]:
# Read class.csv into a list of rows; each row is the list of fields csv.reader yields.
with open(labels_path[0], mode="r") as labels_file:
    class_labels = list(csv.reader(labels_file))

In [13]:
# S3 prefix holding the GMM prediction files for the chosen vectorizer.
# Only doc2vec and sentenceBERT are supported; sentenceBERT adds the transformer model to the prefix.
if vectorize_type not in ("doc2vec", "sentenceBERT"):
    raise NotImplementedError

pred_prefix = f"Clustering/{data_type}/{vectorize_type}"
if vectorize_type == "sentenceBERT":
    pred_prefix = f"{pred_prefix}/{transformer_model}"
pred_path = f"{pred_prefix}/GMM/pred/"

In [21]:
# Download everything under pred_path; the returned list of local .npy paths is echoed as the cell output.
s3.download(pred_path)

['/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/0.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/1.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/10.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/11.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/12.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/diag/13.npy',
 '/home/jovyan/temporary/Clustering/20NewsSampled1/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/GMM/pred/128/centralized/10/d

# Functions

In [22]:
class Corpus:
    """Streamed bag-of-words corpus backed by a gensim ``Dictionary``.

    Documents come from one of two sources:

    * ``path``: a text file with one document per line and tokens separated by
      whitespace; lines are lower-cased when read. When ``path`` is set,
      iteration always reads from the file.
    * ``texts``: an iterable of token lists that can be iterated repeatedly.

    Args:
        path: Optional location of the one-document-per-line file.
        texts: Optional iterable of tokenised documents.

    Attributes:
        dictionary: Built from ``texts`` when given. When only ``path`` is
            given it is built from the file's tokens, so ``doc2bow`` has a
            vocabulary to map against instead of an empty dictionary.
    """

    def __init__(self, path=None, texts=None):
        self.path = path
        self.texts = texts
        if texts is None and path is not None:
            # No in-memory documents: learn the vocabulary from the file itself.
            self.dictionary = Dictionary(self._iter_file_tokens())
        else:
            self.dictionary = Dictionary(texts)

    def _iter_file_tokens(self):
        """Yield the lower-cased, whitespace-split tokens of each line of ``self.path``."""
        # The context manager closes the handle once iteration finishes or the
        # generator is closed; a bare ``for line in open(...)`` never did.
        with open(self.path) as source:
            for line in source:
                yield line.lower().split()

    def __iter__(self):
        """Yield one bag-of-words vector (``dictionary.doc2bow`` output) per document."""
        if self.path is not None:
            for tokens in self._iter_file_tokens():
                yield self.dictionary.doc2bow(tokens)
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

# Calculate Coherence

In [23]:
# Tokenise each document on single spaces. Rows with a missing value become [""]
# so every row still contributes a document. pd.isna is used instead of an
# identity comparison with np.nan, which only matches that one NaN object.
texts = df.words_nonstop.progress_apply(
    lambda x: [""] if pd.isna(x) else x.split(" ")
).tolist()
# Share a single dictionary between `corpus` and CoherenceModel.
# filter_extremes() drops tokens and re-numbers the remaining ids, so a second,
# separately built dictionary would no longer agree with the ids `corpus` emits.
corpus = Corpus(texts=texts)
dictionary = corpus.dictionary
# With default arguments gensim drops tokens that occur in fewer than 5 documents
# or in more than 50% of documents, then keeps at most the 100,000 most frequent.
# NOTE(review): the previous comment described an 80% cut-off; pass no_above=0.8
# if that threshold is what is actually intended.
dictionary.filter_extremes()

100%|██████████| 18770/18770 [00:00<00:00, 74601.55it/s]


In [24]:
# Local directory that receives the coherence scores, chosen with the same
# doc2vec / sentenceBERT branching as pred_path.
# The second branch has to be `elif`: as a separate `if`, its `else` also ran for
# doc2vec and raised NotImplementedError right after the doc2vec path was assigned.
if vectorize_type == "doc2vec":
    coherence_base_path = (
        f"/home/jovyan/temporary/Postprocessing/{data_type}/{vectorize_type}/GMM/coherence"
    )
elif vectorize_type == "sentenceBERT":
    coherence_base_path = f"/home/jovyan/temporary/Postprocessing/{data_type}/{vectorize_type}/{transformer_model}/GMM/coherence"
else:
    raise NotImplementedError

In [26]:
# Score every saved GMM prediction file with c_v coherence and write one .npy per model.
# The grid is materialised so tqdm knows its length and can show a total / ETA.
param_grid = list(product(vector_dims, normalizations, topic_nums, covariance_types))
for vector_dim, normalization, topic_num, covariance_type in tqdm(param_grid):
    for model_num in range(model_nums):
        pred = np.load(
            f"{root_path_temporary}{pred_path}{vector_dim}/{normalization}/{topic_num}/{covariance_type}/{model_num}.npy"
        )

        # NOTE(review): CoherenceModel's `topics` argument is documented as a list of
        # tokenised topics; here it receives the loaded prediction array reshaped to a
        # single row — confirm that `pred` holds tokens known to `dictionary`.
        cm = CoherenceModel(
            topics=pred.reshape(1, -1),
            corpus=corpus,
            dictionary=dictionary,
            texts=texts,
            coherence="c_v",
        )
        # Compute the score once and reuse it; previously get_coherence() was called
        # a second time for np.save and this value was left unused.
        coherence = cm.get_coherence()
        coherence_path = f"{coherence_base_path}/{vector_dim}/{normalization}/{topic_num}/{covariance_type}/{model_num}.npy"

        # make_filepath comes from util; presumably it prepares the parent directory
        # and returns the path — confirm.
        np.save(make_filepath(coherence_path), coherence)

42it [07:25, 10.61s/it]


## Upload files

In [None]:
# Upload the locally written coherence results rooted at coherence_base_path.
s3.upload(coherence_base_path)

In [None]:
# Clean up local copies through the S3 helper (exact scope depends on S3Manager.delete_local_all — confirm).
s3.delete_local_all()

In [None]:
# Send a completion message naming the dataset and vectorizer.
# send_line_notify presumably comes from the `util` star import; its delivery mechanism is not visible here.
send_line_notify(f"end {data_type} {vectorize_type}")