# Imports

In [1]:
import csv
import os
import sys

import gensim.downloader as api
import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import KeyedVectors
from tqdm import tqdm

## Add configuration and utility directories to the import path

In [2]:
# Extend the import search path with the shared config and util directories
# so the `ALL` and `util` modules imported below can be found.
for extra_dir in ("/home/jovyan/core/config/", "/home/jovyan/core/util/"):
    sys.path.append(extra_dir)

In [3]:
from ALL import config 
from util import *

## Set display options and dataset

In [4]:
# Register tqdm with pandas, then raise the pandas display limits
# (up to 100 columns and 50 rows per rendered frame).
tqdm.pandas()
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [5]:
# Dataset name; interpolated into the ../data/<data_type>/... paths read and written below.
data_type = "20News"

# Read data

In [6]:
# Load the dataset's master table; the first CSV column becomes the index.
master_csv_path = f"../data/{data_type}/master.csv"
df = pd.read_csv(master_csv_path, index_col=0)

In [7]:
# Read the class-label file; each CSV row is kept as a list of its fields.
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields containing embedded newlines are parsed correctly.
with open("../../DataShaping/data/20News/class.csv", mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

# Make Corpus

In [8]:
class Corpus:
    """Re-iterable stream of bag-of-words documents.

    Documents come either from a text file (``path``; one document per line,
    tokens separated by whitespace, lower-cased on read) or from an in-memory
    list of token lists (``texts``). When both are given, ``path`` wins.

    Parameters
    ----------
    dictionary : object exposing ``doc2bow(tokens)``
        Used to convert each token list into its bag-of-words form.
    path : str, optional
        File to stream documents from.
    texts : list of list of str, optional
        Pre-tokenized documents, used when ``path`` is None.
    """

    def __init__(self, dictionary, path=None, texts=None):
        self.path = path
        self.texts = texts
        self.dictionary = dictionary

    def __iter__(self):
        """Yield ``dictionary.doc2bow(tokens)`` for every document."""
        if self.path is not None:
            # The context manager closes the file even when the consumer
            # stops iterating early.
            with open(self.path) as handle:
                for line in handle:
                    # assume there's one document per line, tokens separated by whitespace
                    yield self.dictionary.doc2bow(line.lower().split())
        else:
            for tokens in self.texts:
                yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents.

        For a file-backed corpus the lines are counted, so the result agrees
        with what ``__iter__`` yields; otherwise ``len(texts)`` is returned.
        """
        if self.path is not None:
            with open(self.path) as handle:
                return sum(1 for _ in handle)
        return len(self.texts)

In [9]:
def _split_tokens(text):
    """Lower-case ``text`` and split it on single spaces.

    Missing values (NaN, None, pd.NA) become ``[""]``. ``pd.isna`` is used
    because an identity test against ``np.nan`` only recognises that one
    float object and lets every other missing marker through to ``.lower()``.
    """
    if pd.isna(text):
        return [""]
    return text.lower().split(" ")


# Tokenize every document, build the gensim dictionary, prune it with
# filter_extremes' default thresholds, and wrap the token lists in a Corpus.
texts = df.words_nonstop.apply(_split_tokens).tolist()
dictionary = Dictionary(texts)
dictionary.filter_extremes()
corpus = Corpus(texts=texts, dictionary=dictionary)

# Load Model

In [16]:
# Fetch the pretrained GloVe vectors and save them once. Later runs skip this
# expensive step because the next cell loads the saved copy from the same path.
# NOTE(review): make_filepath comes from `util`; presumably it prepares the
# parent directory and returns the path unchanged — confirm.
word2vec_path = f"../data/{data_type}/GLDA/word2vec.model"
if not os.path.exists(word2vec_path):
    model = api.load("glove-wiki-gigaword-300")
    model.save(make_filepath(word2vec_path))

In [10]:
# Load the saved word vectors from disk.
word2vec_path = f"../data/{data_type}/GLDA/word2vec.model"
model = KeyedVectors.load(word2vec_path)

In [11]:
# Map dictionary id -> token, keeping only tokens the embedding model knows.
vocab = {}
for token, token_id in dictionary.token2id.items():
    if token in model.key_to_index:
        vocab[token_id] = token

In [12]:
# vocab.txt and vectorized_vocab.txt are written in `vocab` order and carry no
# ids, so an id in corpus.txt can only be resolved by row position. Raw
# dictionary ids stop matching those rows as soon as one dictionary token has
# no embedding, so each id is translated to its row and tokens without a
# vector are dropped.
# NOTE(review): assumes the consumer of corpus.txt uses 0-based row indices — confirm.
row_of_token_id = {token_id: row for row, token_id in enumerate(vocab)}

tokenized_documents = []
for document in corpus:
    tokenized_document = []
    for token_id, count in document:
        row = row_of_token_id.get(token_id)
        if row is None:
            # Token has no pretrained vector, hence no row to point at.
            continue
        # Repeat the row index once per occurrence of the token.
        tokenized_document.extend([str(row)] * count)
    tokenized_documents.append(" ".join(tokenized_document))

In [13]:
# Look up the embedding of every retained token, keyed by its dictionary id.
vectorized_vocab = {}
for token_id, token in vocab.items():
    vectorized_vocab[token_id] = model[token]

In [14]:
# Stack the per-token vectors into one 2-D array, one row per vocab entry.
# np.stack expects a real sequence (list/tuple) of arrays; passing the dict
# view directly is deprecated in NumPy and rejected by newer releases.
vectors = np.stack(list(vectorized_vocab.values()))

  if await self.run_code(code, result, async_=asy):


In [20]:
# Sanity check: (number of tokens with an embedding, embedding dimension).
vectors.shape

(26880, 300)

In [18]:
# Write the embedding matrix as plain text: one row per line, values separated by spaces.
vectors_path = make_filepath(f"../data/{data_type}/GLDA/vectorized_vocab.txt")
np.savetxt(vectors_path, vectors, newline="\n", delimiter=" ")

In [16]:
# Write one token per line, in `vocab` order (the order the rows of `vectors` were built in).
vocab_path = make_filepath(f"../data/{data_type}/GLDA/vocab.txt")
with open(vocab_path, "w") as f:
    f.writelines(f"{word}\n" for word in vocab.values())

In [17]:
# Write one document per line; each line is the space-joined string built in `tokenized_documents`.
corpus_path = make_filepath(f"../data/{data_type}/GLDA/corpus.txt")
with open(corpus_path, "w") as f:
    f.writelines(f"{document}\n" for document in tokenized_documents)