# Import

In [10]:
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics.pairwise import cosine_similarity
from numpy import linalg as LA
from tqdm import tqdm

# Functions

In [2]:
def centralize_array(array):
    """Shift ``array`` so that its mean along axis 0 becomes zero.

    The mean is computed over axis 0 (down the rows of a 2-D array) and
    subtracted from every entry through broadcasting. The result is a new
    object; the input is left untouched.
    """
    axis0_mean = np.mean(array, axis=0)
    return array - axis0_mean

In [3]:
def normarize_array(array):
    """Rescale every row of a 2-D ``array`` to unit Euclidean length.

    Each row is divided by the square root of the sum of its squared
    entries. The norms are reshaped into a column so that the division
    broadcasts across the row.

    Note: a row made only of zeros has norm 0, so the division produces NaN
    for that row.
    """
    squared_sums = np.sum(array * array, axis=1)
    row_norms = np.sqrt(squared_sums).reshape(-1, 1)
    return array / row_norms

In [4]:
def normarize_vector(vector):
    """Rescale ``vector`` to unit Euclidean length.

    The squared entries are summed over the whole input (no axis is given),
    and the vector is divided by the square root of that sum.
    """
    squared_length = np.sum(vector * vector)
    return vector / np.sqrt(squared_length)

In [5]:
def get_average_vector(vectors):
    """Return the unit-length direction of the sum of ``vectors``.

    The inputs are added together along axis 0 and the total is rescaled by
    ``normarize_vector``. Because the result is normalised, dividing the sum
    by the number of vectors first would give the same output.
    """
    return normarize_vector(np.sum(vectors, axis=0))

# Read data

In [6]:
# Load the AG News training split into a DataFrame.
# header=None tells pandas not to treat the first row as column names, so the
# three columns are named explicitly here.
ag_news_df = pd.read_csv(
    "../data/CharCnn_Keras-master/data/ag_news_csv/train.csv",
    header=None,
    names=["class", "title", "description"],
)

In [7]:
# Vector dimensionalities to evaluate. The scoring loop reads one file per
# entry, "./data/vector/{dim_num}.csv".
dim_nums = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 80, 160, 320, 640]

In [8]:
# The integers 1, 2, 3, 4 (the upper bound of range is exclusive).
# NOTE(review): presumably the class ids found in ag_news_df["class"] — confirm.
class_nums = range(1, 5)

In [9]:
def get_kmeans(vectors, n_clusters, random_state=0):
    """Cluster ``vectors`` with k-means and return one cluster index per vector.

    Parameters
    ----------
    vectors : sequence of equal-shape arrays, or a 2-D array
        Stacked along a new first axis with ``np.stack`` before fitting.
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``KMeans``. k-means starts from randomly chosen
        centroids, so an unseeded run can assign points differently each
        time and every score derived from the clusters changes with it. The
        fixed default keeps results reproducible across kernel restarts;
        pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The cluster index of each input vector, as returned by
        ``KMeans.fit_predict``.
    """
    mat = np.stack(vectors)
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(mat)

In [13]:
# Values of the "class" column as a NumPy array; used as the reference
# labelling when the clusters are scored.
label = ag_news_df["class"].to_numpy()

In [15]:
# Cosine-similarity setting: rows are mean-centred and scaled to unit length
# before clustering. For unit vectors the squared Euclidean distance equals
# 2 - 2 * cos, so k-means on them groups texts by direction.
# MI maps each dimensionality to the adjusted mutual information between the
# true classes and the k-means clusters.
MI = {}
for dim_num in tqdm(dim_nums):
    # read vectors
    text_vectors = np.loadtxt(f"./data/vector/{dim_num}.csv", delimiter=',')
    # Fail fast: a row-count mismatch would otherwise only surface in the
    # scoring call, after the slow clustering step has already run.
    if len(text_vectors) != len(label):
        raise ValueError(
            f"./data/vector/{dim_num}.csv has {len(text_vectors)} rows "
            f"but there are {len(label)} labels"
        )
    # preprocessing
    centralized_vectors = centralize_array(text_vectors)
    normarized_vectors = normarize_array(centralized_vectors)
    # get topics by kmeans: one cluster per entry of class_nums
    pred = get_kmeans(normarized_vectors, len(class_nums))
    # calc mutual information
    MI[dim_num] = adjusted_mutual_info_score(label, pred)

100%|██████████| 15/15 [24:24<00:00, 97.61s/it] 


In [16]:
# Turn the {dim_num: score} dict into a DataFrame indexed by dimensionality,
# then name its single column "MI".
# NOTE(review): MI is rebound from a dict to a DataFrame here, so this cell is
# not safe to re-run on its own — re-run the loop cell that builds the dict first.
MI = pd.DataFrame.from_dict(MI, orient="index")
MI.columns=["MI"]

In [17]:
# Save the score table as CSV; with to_csv defaults the index (the
# dimensionality) is written as the first column.
MI.to_csv('./data/MI_centralized.csv')

In [18]:
# Display the score table as the cell output.
MI

Unnamed: 0,MI
2,0.192199
3,0.302315
4,0.42437
5,0.208206
6,0.427343
7,0.421215
8,0.419457
9,0.419281
10,0.402986
20,0.202808
