# Import

In [1]:
import csv
import os
import random
import sys

import gensim.downloader as api
import numpy as np
import pandas as pd
import umap
from gensim.corpora.dictionary import Dictionary
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from tqdm import tqdm

## Add configuration and utility module paths

In [2]:
# Make the project's config and util packages importable from this notebook.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

In [4]:
# Seed Python's RNG so the random fallback token chosen for empty documents
# (see the Tokenize section) is reproducible across runs.
random.seed(0)

## Set conditions

In [5]:
# Register tqdm's progress_apply hooks on pandas objects.
tqdm.pandas()

# Limit how much of a DataFrame is rendered in cell output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [6]:
# Dataset identifier; it selects the ../data/{data_type}/ directory that the
# master CSV is read from and that the GLDA outputs are written to.
data_type = "20News"

# Read data

In [7]:
# Load the master table for the selected dataset; the first CSV column is the index.
master_csv_path = f"../data/{data_type}/master.csv"
df = pd.read_csv(master_csv_path, index_col=0)

# Make Corpus

In [8]:
class Corpus:
    """Iterable of bag-of-words documents built with a gensim-style dictionary.

    Documents come from one of two sources:

    * ``path``  -- a text file with one document per line, tokens separated by
      whitespace (lines are lower-cased before splitting);
    * ``texts`` -- an in-memory sequence of already-tokenized documents.

    When ``path`` is given it takes precedence for iteration.

    Parameters
    ----------
    dictionary : object exposing ``doc2bow(tokens)``
        Used to convert each token list into its bag-of-words form.
    path : str, optional
        Location of the one-document-per-line text file.
    texts : sequence of list of str, optional
        Pre-tokenized documents.
    """

    def __init__(self, dictionary, path=None, texts=None):
        self.path = path
        self.texts = texts
        self.dictionary = dictionary

    def __iter__(self):
        """Yield ``dictionary.doc2bow(tokens)`` for every document."""
        if self.path is not None:
            # The context manager closes the file once the generator is
            # exhausted or discarded, instead of leaking the handle.
            with open(self.path) as handle:
                for line in handle:
                    # one document per line, tokens separated by whitespace
                    yield self.dictionary.doc2bow(line.lower().split())
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents.

        Uses ``len(texts)`` when in-memory texts were supplied; otherwise
        counts the lines of the file at ``path`` so that a path-backed corpus
        no longer fails with ``TypeError``.
        """
        if self.texts is not None:
            return len(self.texts)
        with open(self.path) as handle:
            return sum(1 for _ in handle)

In [9]:
# Lower-case each document and split it on single spaces. Missing entries
# become [""]. pd.isna replaces the identity check against np.nan so that any
# missing-value marker (None, float("nan"), pd.NA) takes the fallback branch
# instead of failing on .lower().
texts = df.words_nonstop.apply(
    lambda x: [""] if pd.isna(x) else x.lower().split(" ")
).tolist()

# Build the vocabulary, then prune it; filter_extremes is called without
# arguments, so the library's default thresholds apply.
dictionary = Dictionary(texts)
dictionary.filter_extremes()

# Load Model

In [12]:
# Download the pretrained GloVe vectors and save them, but only when no saved
# copy exists yet, so a full re-run does not fetch the model again. The next
# cell loads the saved copy in either case.
# NOTE(review): make_filepath comes from util; presumably it prepares the
# parent directory and returns the path unchanged -- confirm.
model_path = "../data/GLDA/word2vec.model"
if not os.path.exists(model_path):
    model = api.load("glove-wiki-gigaword-300")
    model.save(make_filepath(model_path))

In [10]:
# Load the word vectors from the copy saved on disk.
model = KeyedVectors.load("../data/GLDA/word2vec.model")

# Tokenize

## Corpus

In [11]:
# Keep only the dictionary tokens that the embedding model has a vector for.
token2id = dict(
    (word, idx)
    for word, idx in dictionary.token2id.items()
    if word in model.key_to_index
)

In [12]:
# Renumber the surviving tokens 0..N-1, preserving their current order.
token2id = dict(zip(token2id, range(len(token2id))))

In [13]:
# Encode every document as the string ids of its in-vocabulary tokens;
# tokens missing from token2id are dropped.
tokenized_documents = [
    [str(token2id[token]) for token in tokens if token in token2id]
    for tokens in texts
]

In [14]:
# A document with no in-vocabulary tokens gets a single random token id so that
# every line of the corpus file is non-empty. The fallback must be a
# one-element list: a bare string would later be split into its individual
# digits by " ".join(...), turning e.g. "1234" into the tokens "1 2 3 4".
tokenized_documents = [
    document if document else [str(random.randrange(len(token2id)))]
    for document in tokenized_documents
]

In [15]:
# Collapse each document's token ids into one space-separated line.
tokenized_documents = list(map(" ".join, tokenized_documents))

In [16]:
# Write the corpus: one document per line, token ids separated by spaces.
corpus_path = make_filepath(f"../data/{data_type}/GLDA/corpus.txt")
with open(corpus_path, "w") as corpus_file:
    corpus_file.writelines(f"{line}\n" for line in tokenized_documents)

## Vocab

In [17]:
# Inverse mapping: token id -> token.
id2token = dict(zip(token2id.values(), token2id.keys()))

In [18]:
# Write the vocabulary, one token per line, in id2token order.
vocab_path = make_filepath(f"../data/{data_type}/GLDA/vocab.txt")
with open(vocab_path, "w") as vocab_file:
    vocab_file.writelines(f"{token}\n" for token in id2token.values())

In [19]:
# Look up the embedding vector of every vocabulary token, keyed by token id.
vectorized_vocab = {
    token_id: model[token] for token_id, token in id2token.items()
}

In [20]:
# Stack the per-token vectors into one 2-D array, one row per vocabulary entry.
# np.stack requires a sequence; a dict view is not one (deprecated, then a
# TypeError in newer NumPy), so materialize the values as a list first.
vectors = np.stack(list(vectorized_vocab.values()))

  if await self.run_code(code, result, async_=asy):


In [21]:
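# Optional dimensionality reduction with UMAP (currently disabled; the full
# embedding vectors are used as-is in the next cell):
# reduced_vectors = umap.UMAP(n_components=100, random_state=0).fit_transform(vectors)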

In [22]:
# No dimensionality reduction is applied: reduced_vectors is the same array
# object as the stacked embedding vectors.
reduced_vectors = vectors

In [23]:
# Sanity check: (number of vocabulary entries, vector dimension).
reduced_vectors.shape

(26880, 300)

In [24]:
# Save the vocabulary vectors as text: one row per token, values separated by spaces.
vectorized_vocab_path = make_filepath(
    f"../data/{data_type}/GLDA/vectorized_vocab.txt"
)
np.savetxt(vectorized_vocab_path, reduced_vectors, delimiter=" ", newline="\n")