In [1]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors

In [None]:
# Embeddings used: FastText embeddings from SBWC (Spanish Billion Word Corpus), https://github.com/dccuchile/spanish-word-embeddings

In [2]:
# Load the pretrained FastText SBWC vectors from the word2vec-format file as KeyedVectors.
# NOTE(review): the name `model` is rebound to a KMeans estimator further down in this
# notebook, so cells that read `model.index_to_key` only work before that cell has run.
model = KeyedVectors.load_word2vec_format("fasttext-sbwc.vec.gz")

In [3]:
import pandas as pd
from re import sub
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import multiprocessing
from sklearn.cluster import KMeans

In [4]:
# Load the scraped Instagram hashtag dataset (JSON export) into a DataFrame.
# Later cells read its `caption` column.
file = pd.read_json("dataset_instagram-hashtag-scraper_2022-10-14_18-12-08-523.json") 

In [5]:
def caption_to_word_list(caption):
    """Turn a raw caption into a list of lowercase word tokens.

    Everything that is not an ASCII letter, a digit, or an accented Latin-1
    letter (newlines, punctuation, emojis, hashtag symbols, ...) is replaced
    by a space, and the result is split on whitespace.

    Parameters
    ----------
    caption : Any
        The raw caption. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) produce an empty list
        instead of the fake tokens ``"none"`` / ``"nan"``.

    Returns
    -------
    list of str
        The cleaned tokens, in order of appearance.
    """
    # Missing captions carry no words; do not let str() invent a token for them.
    if caption is None or (isinstance(caption, float) and caption != caption):
        return []

    caption = str(caption).lower()
    # Negated character class: keep letters/digits, blank out everything else.
    # The Latin-1 range is split so that the math signs U+00D7 and U+00F7 are
    # excluded, and there is no stray "^" inside the class (a second caret
    # would be a literal and let "^" characters through).
    # "#" is covered here too, so no separate hashtag substitution is needed.
    caption = sub(r"[^A-Za-z0-9À-ÖØ-öø-ÿ]", " ", caption)
    return caption.split()

In [6]:
# Build the cleaned frame as a NEW DataFrame whose `caption` column holds token lists.
# `assign` leaves the raw `file` frame untouched, so this cell can be re-run safely
# (a plain `file_cleaned = file` would alias the raw frame and overwrite its captions).
file_cleaned = file.assign(caption=file.caption.apply(caption_to_word_list))

In [7]:
# Keep only the rows whose caption has more than one token, as an independent copy.
file_model = file_cleaned.loc[file_cleaned.caption.str.len() > 1].copy()
# Plain Python list of token lists, one entry per remaining caption.
sent = list(file_model.caption)

In [8]:
# Fit the phrase model on the tokenized captions.
phrases = Phrases(
    sentences=sent,
    min_count=1,
    progress_per=10000,
)
# Wrap the fitted phrase model in a Phraser for applying it to token lists.
bigram = Phraser(phrases_model=phrases)
# Captions with the phrase model applied; this is the training corpus used below.
sentences = bigram[sent]

In [9]:
# Word2Vec model trained on the caption corpus. vector_size=300 is meant to line up
# with the pretrained vectors merged in later — TODO confirm their dimensionality.
model_2 = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never ask for fewer than one worker:
                     # cpu_count() - 1 evaluates to 0 on a single-core machine.
                     workers=max(1, multiprocessing.cpu_count() - 1))
# Build the initial vocabulary from the caption corpus.
model_2.build_vocab(sentences, progress_per=10000)


In [12]:
# Remember the corpus size BEFORE the vocabulary update below; it is passed to train() later.
# (presumably the update call changes corpus_count — TODO confirm)
total_examples = model_2.corpus_count
# All words known to the pretrained KeyedVectors loaded into `model`.
words = list(model.index_to_key)
# Offer the pretrained words to model_2 as one extra "sentence".
# NOTE(review): model_2 was created with min_count=3 and every pretrained word occurs
# exactly once in this single sentence — verify these words are actually added and not
# discarded by the min_count filter.
model_2.build_vocab([words], update=True)


In [21]:
import numpy as np

# Give every vocabulary entry its own lock factor of 1.0 so that the following
# intersect_word2vec_format() call can set a per-word value.
# The dtype is float32 on purpose: np.ones() defaults to float64, while gensim's
# training code works with single-precision lock factors.
model_2.wv.vectors_lockf = np.ones(len(model_2.wv), dtype=np.float32)

In [23]:
# Merge the pretrained vectors from the same file that was loaded into `model` above
# into model_2's word vectors.
# NOTE(review): no `lockf` argument is passed, so the library default applies — confirm
# whether the imported vectors are meant to stay frozen or to be fine-tuned by train().
model_2.wv.intersect_word2vec_format('fasttext-sbwc.vec.gz')

In [27]:
# Train on the caption corpus. total_examples is the corpus_count captured before the
# vocabulary update; the number of epochs is the model's own configured value.
model_2.train(sentences, total_examples=total_examples, epochs=model_2.epochs)

(4610391, 12275800)

In [28]:
# Scale every word vector to unit length in place. This replaces the deprecated
# init_sims(replace=True), which emits a deprecation warning under gensim 4.
# The model should not be trained further after this destructive step.
model_2.wv.unit_normalize_all()

  """Entry point for launching an IPython kernel.


In [29]:
# Persist the trained model to disk.
model_2.save("word2vec.model")

# Export frame: `old_caption` keeps the original tokens joined by spaces, while
# `caption` is rewritten with the phrase model applied to each token list.
file_export = file_model.assign(old_caption=file_model.caption.str.join(' '))
file_export['caption'] = file_export['caption'].apply(
    lambda tokens: ' '.join(bigram[tokens])
)
# Only the rewritten caption column goes into the CSV.
file_export[['caption']].to_csv('cleaned_dataset.csv', index=False)

# Reload the saved model and keep just its word vectors for the clustering step.
word_vectors = Word2Vec.load("word2vec.model").wv

In [30]:
# The exact number of clusters is not important here: groups are formed regardless,
# and what we care about is how these groups turn out in general.
# random_state is an explicit integer seed (the previous `True` was silently treated as 1).
# NOTE: this rebinds `model`, which earlier held the pretrained KeyedVectors.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [33]:
# The 10 vocabulary entries most similar to the centroid of cluster 0.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('manera_decisiva', 0.9201809167861938),
 ('q_reciba', 0.918436586856842),
 ('asertiva_para', 0.9182451963424683),
 ('gobierno_bolivariano', 0.9175695180892944),
 ('asamblea_ven', 0.9174111485481262),
 ('cepas_concientes', 0.9172927141189575),
 ('para_esclarecer', 0.916012704372406),
 ('designó_una', 0.9146816730499268),
 ('ofertascanaima', 0.9144347310066223),
 ('asamblea_general', 0.9136211276054382)]

In [34]:
# The 10 vocabulary entries most similar to the centroid of cluster 1.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('cambioscanaimaexpress_pagina', 0.9990593194961548),
 ('ultimasnoticiasdepuebla', 0.998956561088562),
 ('messi_cucuta', 0.9985226392745972),
 ('bogota_santiago', 0.9984877705574036),
 ('barranquilla_medellin', 0.9984648823738098),
 ('riodejaneiro_cucuta', 0.9983223676681519),
 ('juzgado_con', 0.9982469081878662),
 ('siempre_grabada', 0.9982369542121887),
 ('friedrich_nietzsche', 0.9981515407562256),
 ('juzgados_con', 0.9981502294540405)]