In [None]:
!python -m pip install -U gensim

In [18]:
# From tutorial https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483 
import pandas as pd
from re import sub
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import multiprocessing
from sklearn.cluster import KMeans

In [None]:

# Load the scraped Instagram posts from the JSON export into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
file = pd.read_json("dataset_instagram-hashtag-scraper_2022-10-14_18-12-08-523.json") 

In [None]:
# clean up captions
# want to remove newlines, exclamation points, emojis (for now), as well as hashtag symbols
def caption_to_word_list(caption):
    """Turn a raw caption into a list of lowercase word tokens.

    The caption is lowercased, every character that is not an ASCII letter,
    a digit or an accented Latin-1 letter is replaced by a space, and the
    result is split on whitespace. Hashtag symbols, punctuation, newlines and
    emojis therefore act as token separators and never appear in the output.

    Parameters
    ----------
    caption : object
        The raw caption. Non-string values are converted with ``str()``;
        ``None`` and float ``NaN`` (how a missing caption typically shows up
        in a DataFrame column) are treated as "no caption".

    Returns
    -------
    list of str
        The tokens in their original order; empty when there is no caption
        or the caption holds no letters or digits.
    """
    # Without this guard str() would turn a missing caption into the bogus
    # tokens "none" / "nan".
    if caption is None or (isinstance(caption, float) and caption != caption):
        return []
    caption = str(caption).lower()
    # Keep letters and digits only. The ranges À-Ö / Ø-ö / ø-ÿ cover the accented
    # Latin-1 letters while skipping the two symbols that sit between them
    # (× and ÷). A `^` is only a negation marker in first position, so it must
    # not be repeated inside the class or it is kept as a literal character.
    # `#` is already covered by this substitution; no separate pass is needed.
    caption = sub(r"[^A-Za-z0-9À-ÖØ-öø-ÿ]", " ", caption)
    return caption.split()

In [None]:
# Build a new frame rather than aliasing `file`: a plain `file_cleaned = file`
# makes both names point at the same object, so overwriting the caption column
# would also destroy the raw captions in `file` and make this cell unsafe to re-run.
file_cleaned = file.assign(caption=file.caption.apply(caption_to_word_list))

In [None]:
# Work on a copy and keep only the posts whose tokenised caption holds
# more than one token.
file_model = file_cleaned.copy()
file_model = file_model[file_model["caption"].str.len() > 1]
# Training corpus: one token list per remaining post.
sent = list(file_model["caption"])

In [28]:
# Preview the first three rows that remain after the caption-length filter.
file_model.head(n=3)

Unnamed: 0,id,type,shortCode,caption,hashtags,mentions,url,commentsCount,firstComment,latestComments,...,timestamp,childPosts,ownerId,videoViewCount,name,topPostsOnly,profilePicUrl,postsCount,topPosts,latestPosts
0,2948966013950609920,Sidecar,Cjs1PckOcqp,"[avaa, una, vez, más, dejó, su, huella, verde,...","[Avaa, Avaavoluntariado, proexcelencia, uvplv,...",[],https://www.instagram.com/p/Cjs1PckOcqp/,0.0,,[],...,2022-10-14 16:16:05+00:00,[],2176554000.0,,,,,,,
1,2948965002603205632,Image,Cjs1AurOROV,"[tomanota, si, deseas, contactar, con, el, equ...","[TomaNota, canaima, GNU, Linux, soporte, corre...",[],https://www.instagram.com/p/Cjs1AurOROV/,0.0,,[],...,2022-10-14 16:14:04+00:00,[],36399560000.0,,,,,,,
2,2948957681219074560,Sidecar,CjszWMGuPmO,"[posted, tepuyroraima, viva, a, aventura, de, ...","[tepuyroraima, montañistasvenezuela, trekkinga...","[tepuyroraima, montanistasvenezuela, mochileir...",https://www.instagram.com/p/CjszWMGuPmO/,0.0,,[],...,2022-10-14 15:59:31+00:00,[],52029640000.0,,,,,,,


In [None]:
# Collect co-occurrence statistics for adjacent tokens over the whole corpus;
# min_count=1 means even a pair seen once is considered.
phrases = Phrases(sentences=sent, min_count=1, progress_per=10000)
# Wrap the statistics in the lighter Phraser object used to apply the model.
bigram = Phraser(phrases)
# Corpus with detected phrases merged into single tokens.
sentences = bigram[sent]

In [11]:
# Configure the Word2Vec model (no corpus is passed here, so nothing is trained yet)
# and then scan the phrase-merged corpus once to build its vocabulary.
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never ask for fewer than one worker:
                     # cpu_count() - 1 evaluates to 0 on a single-core machine.
                     workers=max(1, multiprocessing.cpu_count() - 1))
w2v_model.build_vocab(sentences, progress_per=10000)


In [None]:
# Train for 30 epochs over the corpus whose size was recorded by build_vocab().
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
# Scale every word vector to unit length in place. This replaces the deprecated
# `init_sims(replace=True)`, which in gensim 4 forwards to this same operation.
# The step is destructive: the model should not be trained further afterwards.
w2v_model.wv.unit_normalize_all()

In [13]:
# Persist the model to "word2vec.model" in the working directory so that it can be
# reloaded later with Word2Vec.load().
w2v_model.save("word2vec.model")

In [15]:
# Derive the export frame in one step from `file_model`:
#   old_caption - the original tokens joined back into a space-separated string
#   caption     - the tokens after phrase detection, joined the same way
file_export = file_model.assign(
    old_caption=file_model.caption.str.join(' '),
    caption=file_model.caption.apply(lambda tokens: ' '.join(bigram[tokens])),
)

In [17]:
# Write only the phrase-merged caption column to disk, without the DataFrame index.
file_export[['caption']].to_csv('cleaned_dataset.csv', index=False)

In [21]:
# Reload the saved model from disk and keep only its word vectors (`.wv`);
# the rest of the model is not needed for clustering.
word_vectors = Word2Vec.load("word2vec.model").wv

In [22]:
# The exact number of clusters is not critical here: k-means always splits the
# vocabulary into groups, and what we care about is what those groups turn out to contain.
# random_state is written as the integer 1; the previous value `True` was a bool
# that was coerced to the same seed, so the results are unchanged.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [25]:
# List the vocabulary entries closest to the centre of cluster 0.
# NOTE(review): the output stored below this cell shows 20 entries although
# topn=4 is requested, so it appears to come from an earlier run - re-execute to refresh.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=4, restrict_vocab=None)

[('viajesvzla_mochileros', 0.9551268219947815),
 ('alertan', 0.927005410194397),
 ('sistema_solar', 0.9221019148826599),
 ('notamos', 0.9203981757164001),
 ('kanaimö_rioorinoco', 0.9193635582923889),
 ('ensayos', 0.9175423383712769),
 ('chapo', 0.9174365997314453),
 ('distintas_partes', 0.9173939824104309),
 ('rinden', 0.9167763590812683),
 ('amor_eterno', 0.9156236052513123),
 ('dañado', 0.9154722094535828),
 ('importante_entender', 0.9119508266448975),
 ('aprender_algo', 0.9105193614959717),
 ('angustia', 0.9094057083129883),
 ('tumeremo_km88', 0.9086214303970337),
 ('ha_dedicado', 0.9082965850830078),
 ('nos_olvidamos', 0.9063467383384705),
 ('ocurrir', 0.9059949517250061),
 ('echaron', 0.9059560894966125),
 ('allá_afuera', 0.9052088260650635)]

In [26]:
# List the vocabulary entries closest to the centre of cluster 1.
# NOTE(review): the output stored below this cell shows 20 entries although
# topn=4 is requested, so it appears to come from an earlier run - re-execute to refresh.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=4, restrict_vocab=None)

[('fotografodeviaje_fotografoporvenezuela', 0.9865026473999023),
 ('modasostenible_hechoenvenezuela', 0.9860959053039551),
 ('risas_momos', 0.9849964380264282),
 ('os_officelibre', 0.984299898147583),
 ('upcycle_sustainable', 0.9842298626899719),
 ('thermalmechanics_refrigeracion', 0.9841768741607666),
 ('sustainability_suprareciclaje', 0.9836444854736328),
 ('eventos_fotografodelavida', 0.9836214184761047),
 ('modelos_productos', 0.9834763407707214),
 ('composition_geometry', 0.9828994274139404),
 ('sanfelipe_avila', 0.9824132323265076),
 ('chicaihalu_clientaihalu', 0.9823122024536133),
 ('repuestosautomotrices_proveedores', 0.9821250438690186),
 ('miriamvera_tepuyero', 0.9820870757102966),
 ('memesgraciosos_humorlatino', 0.9814759492874146),
 ('adventureisoutthere', 0.9814283847808838),
 ('model_retouch', 0.9813326597213745),
 ('waterpool', 0.981279194355011),
 ('argentina_duelo', 0.9812391400337219),
 ('silopuedesimaginarlopodemosproyectar', 0.9812266826629639)]