In [1]:
import multiprocessing
import re
import matplotlib.pyplot as plt
import numpy as np
import os
import gensim
import logging
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings
from sklearn.preprocessing import StandardScaler
# Hide FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter('ignore', FutureWarning)

# CPU count reported by the OS; create_model() derives its worker count from it.
NCORES = multiprocessing.cpu_count()

# Root logger: timestamped records, WARNING level and above only.
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


In [2]:


def processing_text(texto):
    """Normalise raw text and split it into lowercase tokens.

    Steps, in order: lowercase, replace every non-word character with a
    space, drop isolated single ASCII letters, drop a single ASCII letter at
    the very start, remove ASCII digit runs, then split on whitespace.

    Args:
        texto: The text to clean. Non-string values are coerced with ``str``.

    Returns:
        list[str]: The remaining tokens. Blank or fully stripped input yields
        an empty list (never ``['']``).
    """
    # Coerce before lowercasing so non-string input cannot fail on .lower().
    cleaned = str(texto).lower()
    # Punctuation and every kind of whitespace (including newlines) -> space.
    cleaned = re.sub(r'\W', ' ', cleaned)
    # Single letters standing alone between whitespace carry no meaning here.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)
    # Single letter at the start of the text. The former pattern
    # r'\^[a-zA-Z]\s+' searched for a literal caret, which the \W pass above
    # has already removed, so it could never match.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)
    cleaned = re.sub(r'[0-9]+', ' ', cleaned)
    # split() without arguments collapses space runs, ignores leading and
    # trailing spaces, and returns [] for blank text instead of [''].
    return cleaned.split()


def load_folder(folder_path):
    """Read every file in a folder and tokenise it line by line.

    Each line of each regular file becomes one document, tokenised with
    ``processing_text``. Files are read as UTF-8 in sorted name order so the
    resulting corpus order is the same on every run.

    Args:
        folder_path: Directory to read; relative or absolute.

    Returns:
        list[list[str]]: One token list per input line.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    logging.warning(f"Loading files from {folder_path}")
    documents = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for entry in sorted(os.listdir(folder_path)):
        # os.path.join keeps absolute folder paths intact.
        path = os.path.join(folder_path, entry)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        # Context manager guarantees the handle is closed after reading.
        with open(path, encoding="utf-8") as handle:
            documents.extend(processing_text(line) for line in handle)
    logging.warning(f"Done!")
    return documents


def create_model(documents, all_docs=None, dim=100, filename=None):
    """Train a gensim Word2Vec model and optionally save it.

    The vocabulary is built from ``all_docs`` while training runs only over
    ``documents``, so several models can share one vocabulary.

    Args:
        documents: Tokenised documents (lists of tokens) to train on.
        all_docs: Tokenised documents used to build the vocabulary. Falls
            back to ``documents`` when ``None`` or empty.
        dim: Size of the word vectors.
        filename: If truthy, path the trained model is saved to.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    all_docs = all_docs or documents
    logging.warning(f"Creating model")
    # Leave two cores free, but never request fewer than one worker:
    # NCORES - 2 is zero or negative on machines with one or two cores.
    n_workers = max(1, NCORES - 2)
    # NOTE(review): compute_loss is set only on the constructor and train()
    # below is called without it -- confirm against the gensim documentation
    # whether the training loss is actually tracked in this setup.
    model = gensim.models.Word2Vec(
        vector_size=dim,
        window=3,
        min_count=2,
        sg=1,
        negative=10,
        workers=n_workers,
        seed=42,
        compute_loss=True)
    model.build_vocab(all_docs)
    model.train(documents, total_examples=len(documents), epochs=100)
    logging.warning(f"Model trained!")
    if filename:
        model.save(filename)
        logging.warning(f"Model saved to {filename}")
    return model


def create_model_from_folder(folder_path, all_docs=None, dim=100):
    """Load a folder of text files, train a model on it and save the model.

    The model is saved as ``./models/<folder name>_<dim>.model``; the
    ``models`` directory is created when missing.

    Args:
        folder_path: Directory with the text files to train on.
        all_docs: Optional tokenised corpus used to build the vocabulary;
            passed through to ``create_model``.
        dim: Size of the word vectors.

    Returns:
        The trained model returned by ``create_model``.

    Raises:
        OSError: If the ``models`` directory cannot be created.
    """
    # exist_ok tolerates an existing directory while still surfacing real
    # failures (e.g. permissions) instead of silently ignoring every error.
    os.makedirs("models", exist_ok=True)
    docs = load_folder(folder_path)
    # Use only the last path component so nested or slash-terminated folder
    # paths still map to a file directly inside ./models.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    f = f"./models/{folder_name}_{dim}.model"
    return create_model(docs, all_docs=all_docs, filename=f, dim=dim)


def weird_plot(model, words, kind="TSNE", ndim=2, topn=50):
    """Plot words and their nearest neighbours in a 2-D or 3-D projection.

    For each entry of ``words`` the word vector (drawn in red) and the
    vectors of its ``topn`` most similar words (drawn in blue) are collected,
    reduced to ``ndim`` dimensions and shown as a labelled scatter plot.

    With ``kind="PCA"`` a single PCA is fitted. For any other ``kind`` a
    t-SNE is fitted once per integer perplexity in ``[1, n_points)`` and the
    embedding with the lowest KL divergence is kept. This sweep is costly:
    it performs ``n_points - 1`` t-SNE fits.

    Args:
        model: Trained model exposing word vectors through ``model.wv``.
        words: Iterable of words present in the model vocabulary.
        kind: ``"PCA"`` for PCA; anything else selects t-SNE.
        ndim: Number of output dimensions. Only 2 and 3 are plotted; other
            values are projected and returned without drawing anything.
        topn: Number of nearest neighbours collected per word.

    Returns:
        numpy.ndarray: The projected points, one row per plotted label.
    """
    X = []
    colors = []
    labels = []
    for word in words:
        # The query word itself, highlighted in red.
        X.append(model.wv[word])
        colors.append("red")
        labels.append(word)
        # topn must be passed by keyword: the second positional parameter of
        # most_similar is `negative`, so most_similar([word], 50) did not
        # request 50 neighbours.
        for neighbour, _similarity in model.wv.most_similar(
                positive=[word], topn=topn):
            X.append(model.wv[neighbour])
            colors.append("blue")
            labels.append(neighbour)
    X = np.array(X)

    if kind == "PCA":
        X_pca = PCA(n_components=ndim).fit_transform(X)
    else:
        X_pca = None
        min_div = np.inf
        for perplexity in range(1, len(X)):
            tsne = TSNE(n_components=ndim, init="pca",
                        random_state=42, perplexity=perplexity)
            candidate = tsne.fit_transform(X)
            # Strict '<' keeps the smallest perplexity among ties. The
            # embedding is cached so the winner is not fitted a second time.
            if tsne.kl_divergence_ < min_div:
                min_div = tsne.kl_divergence_
                X_pca = candidate
        if X_pca is None:
            # No candidate was accepted (e.g. the sweep range was empty):
            # fall back to the fixed default perplexity.
            X_pca = TSNE(n_components=ndim, init="pca", random_state=42,
                         perplexity=9).fit_transform(X)

    if ndim == 2:
        plt.figure(figsize=[10, 10])
        plt.scatter(*X_pca.T, color=colors)
        for label, point in zip(labels, X_pca):
            plt.annotate(label, xy=point)
        plt.show()
    elif ndim == 3:
        fig = plt.figure(figsize=[10, 10])
        ax = fig.add_subplot(projection="3d")
        ax.scatter(*X_pca.T, color=colors)
        for label, point in zip(labels, X_pca):
            ax.text(*point, label, None)
        plt.show()
    return X_pca


In [3]:
# Folders holding the text files of each corpus.
FOLDERS = ["DON_JULIO", "CARLITOS", "MR_POZO"]

# Tokenised lines of all folders pooled together; later passed to the model
# builder as the corpus from which the shared vocabulary is built.
all_docs = [doc for folder in FOLDERS for doc in load_folder(folder)]




In [4]:
# Train one model per (folder, vector size) combination, keep them all in
# `models`, and plot the neighbourhood of the first five vocabulary entries
# of each freshly trained model.
models = []
for folder, dim in ((name, size) for name in FOLDERS for size in (200, 250)):
    model = create_model_from_folder(folder, dim=dim, all_docs=all_docs)
    models.append(model)
    top_n_words = model.wv.index_to_key[:5]
    weird_plot(model, top_n_words)


