In [1]:
import json
import os
import sys
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score
from tqdm import tqdm

In [2]:
# Load the AG News training CSV; the file is read without a header row.
ag_news_train_csv = "../data/CharCnn_Keras-master/data/ag_news_csv/train.csv"
AgNews = pd.read_csv(ag_news_train_csv, header=None)

In [3]:
# Give the three positional columns of the header-less frame readable names.
column_names = ["class", "title", "description"]
AgNews.columns = column_names

In [4]:
# Read one class name per line; "/" characters are removed from the names.
with open("../data/CharCnn_Keras-master/data/ag_news_csv/classes.txt", mode="r") as f:
    # splitlines() copes with a trailing newline and with CRLF line endings, and
    # blank lines are skipped, so no empty class name ends up in the list.
    classes = [
        line for line in f.read().replace('/', '').splitlines() if line.strip()
    ]

In [5]:
# Display the parsed class names as a quick sanity check.
classes

['World', 'Sports', 'Business', 'SciTech']

In [6]:
# One TaggedDocument per description: the words are the single-space-split
# tokens and the only tag is the row's position in the enumeration.
documents = [
    TaggedDocument(words=description.split(' '), tags=[position])
    for position, description in enumerate(AgNews.description)
]

In [7]:
# Embedding dimensions to sweep: a dense set of small sizes followed by a
# doubling sequence from 20 up to 640.
# NOTE(review): 5 is absent from the small sizes — confirm this is intentional.
small_sizes = [2, 3, 4, 6, 7, 8, 9, 10]
doubling_sizes = [20 * 2 ** step for step in range(6)]
vector_sizes = small_sizes + doubling_sizes

In [8]:
def get_doc2vec(texts, vector_size, path):
    """Train a Doc2Vec model on ``texts``, save it, and infer one vector per text.

    The corpus is built from ``texts`` itself, so the function no longer depends
    on a notebook-level ``documents`` variable being defined and in sync.

    Parameters
    ----------
    texts : iterable of str
        Raw documents. Each one is tokenised by splitting on a single space,
        both for training and for inference.
    vector_size : int
        Dimensionality of the learned document vectors.
    path : str or os.PathLike
        Destination passed to ``model.save``. Its parent directory is created
        if it does not exist yet.

    Returns
    -------
    list
        One inferred vector per element of ``texts``, in input order.
    """
    # Tokenise once so training and inference see exactly the same tokens.
    tokenized = [text.split(" ") for text in texts]
    corpus = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized)]

    # Create the target directory *before* training: a missing directory would
    # otherwise only surface at save time, after the whole training run.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    model = Doc2Vec(
        corpus,
        vector_size=vector_size,
        window=5,
        min_count=1,
        epochs=50,
        workers=os.cpu_count(),
    )
    model.save(path)
    return [model.infer_vector(tokens) for tokens in tokenized]

In [9]:
def get_kmeans(vectors, n_clusters, random_state=None):
    """Cluster document vectors with KMeans and return the cluster assignments.

    Parameters
    ----------
    vectors : sequence of array-like
        Equal-length vectors; they are stacked into one matrix with ``np.stack``.
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to make the clustering repeatable.

    Returns
    -------
    The result of ``KMeans.fit_predict`` on the stacked matrix.
    """
    mat = np.stack(vectors)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    return kmeans.fit_predict(mat)

In [10]:
vector_path = "./data/vector/"
model_path = "./data/model/"
# Create the output folders up front: each iteration takes a long time, and
# np.savetxt does not create missing directories, so a missing folder would
# otherwise only be discovered after the first model has been trained.
os.makedirs(vector_path, exist_ok=True)
os.makedirs(f"{model_path}doc2vec/", exist_ok=True)
for vector_size in tqdm(vector_sizes):
    # NOTE(review): the model file carries a ".csv" suffix although it is
    # written by `model.save` inside get_doc2vec (presumably not CSV — confirm).
    # The path is kept unchanged so anything loading it keeps working.
    vectors = get_doc2vec(
        AgNews.description, vector_size, path=f"{model_path}doc2vec/{vector_size}.csv"
    )
    # One row per document, one column per embedding dimension.
    np.savetxt(
        f"{vector_path}{vector_size}.csv",
        np.stack(vectors),
        delimiter=",",
    )

100%|██████████| 14/14 [25:41:02<00:00, 6604.43s/it]  


In [None]:
# Build {vector_size: {row position: inferred vector}} for every size in the sweep.
vectors = dict()
for vector_size in vector_sizes:
    # NOTE(review): unlike get_doc2vec, no `epochs` is passed here, so the
    # library default applies — confirm the difference is intended.
    model = Doc2Vec(
        documents,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=os.cpu_count(),
    )
    # tqdm wraps the Series itself (which has a length) rather than the
    # enumerate object, so the bar can show a total, percentage and ETA.
    vector = {
        idx: model.infer_vector(text.split(" "))
        for idx, text in enumerate(tqdm(AgNews.description))
    }
    vectors[vector_size] = vector

In [None]:
# Cluster the document vectors of every embedding size into 4 groups
# and keep the predicted cluster id of each document.
preds = dict()
for vector_size in tqdm(vector_sizes):
    # A dict view is not a sequence; NumPy's stacking functions expect a
    # list/tuple, so materialise the values first.
    mat = np.vstack(list(vectors[vector_size].values()))
    kmeans = KMeans(n_clusters=4)
    pred = kmeans.fit_predict(mat)
    preds[vector_size] = pred

In [11]:
vectorPath = "./data/vector/"
# np.savetxt does not create missing directories.
os.makedirs(vectorPath, exist_ok=True)
# NOTE(review): these files use the same "./data/vector/{size}.csv" names as the
# get_doc2vec loop earlier in the notebook, so they overwrite its output.
for vector_size in tqdm(vector_sizes):
    # A dict view is not a sequence; np.stack expects a list/tuple of arrays.
    np.savetxt(
        f"{vectorPath}{vector_size}.csv",
        np.stack(list(vectors[vector_size].values())),
        delimiter=",",
    )

  if await self.run_code(code, result, async_=asy):
100%|██████████| 6/6 [00:56<00:00,  9.38s/it]


In [12]:
# Class column as a NumPy array, in row order.
label = AgNews.loc[:, "class"].to_numpy()

In [13]:
# Adjusted mutual information between the class column and the cluster
# assignment, keyed by embedding dimension.
MI = {
    size: adjusted_mutual_info_score(label, assignment)
    for size, assignment in preds.items()
}

In [14]:
# Turn the {dimension: score} mapping into a one-column frame indexed by dimension.
MI = pd.DataFrame.from_dict(MI, orient="index", columns=["MI"])

In [15]:
# Write the score table to disk under ./data.
mi_csv_path = './data/MI.csv'
MI.to_csv(mi_csv_path)