In [None]:
!pip install datasketch 
!pip install kshingle 
!pip install pandas 
!pip install demoji
!pip install nltk

In [None]:
from datasketch import MinHash, MinHashLSH
import kshingle as ks
import numpy as np
import pandas as pd
import demoji
demoji.download_codes()
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Load the raw tweets; the path is relative to the notebook's working directory.
ruta_csv = 'tweets_2022_abril_junio.csv'
df_tweets = pd.read_csv(ruta_csv)
df_tweets.shape

In [None]:
# Keep only the first row seen for every tweet id.
es_primera_aparicion = ~df_tweets.duplicated(subset='id')
df_tweets = df_tweets[es_primera_aparicion]
df_tweets.shape

In [None]:
# Remove retweets (texts containing the literal "RT @") and rows without text.
# The vectorized `.str.contains` with `na=False` tolerates missing values, whereas
# a Python-level `"RT @" not in x` raises TypeError when x is NaN.
# Rows with missing text are dropped here because the later tokenization step
# needs real strings.
tiene_texto = df_tweets['text'].notna()
es_retweet = df_tweets['text'].str.contains('RT @', regex=False, na=False)
df_tweets = df_tweets[tiene_texto & ~es_retweet].reset_index(drop=True)
df_tweets.shape

In [None]:
# Spanish stop words stored as a set: the cleaning function tests membership
# once per token.
stopwords_es = {palabra for palabra in stopwords.words('spanish')}

In [None]:
# Both patterns are compiled once here instead of on every call.

# Code-point ranges treated as emoji / pictographs.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols and pictographs
    u"\U0001F680-\U0001F6FF"  # transport and map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (country flags)
    u"\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # broad catch-all; also covers every BMP script above U+24C2 (CJK, Hangul, ...)
    "]+",
    flags=re.UNICODE,
)

# Anything NOT in this whitelist is removed. The hyphen is placed last so it is a
# literal '-': written as `;-_` it is a range from ';' to '_' that silently keeps
# '<', '=', '>', '?', '[', '\\', ']' and '^' while dropping '-' itself.
_DISALLOWED_CHARS_PATTERN = re.compile(r'[^a-zA-Zá-úÁ-ÚñÑ@/: ,.;_-]+')


def remove_emojis(texto):
    """Strip emojis and every character outside the allowed whitelist.

    Kept characters: ASCII letters, the accented ranges á-ú / Á-Ú, ñ/Ñ, space,
    and the symbols ``@ / : , . ; - _``. Digits and all other characters are
    removed.

    Args:
        texto: String to clean.

    Returns:
        The cleaned string. Removed characters are deleted, not replaced, so
        the whitespace around them is left as-is.
    """
    # First pass: drop emoji / pictograph code points.
    texto_without_emojis = _EMOJI_PATTERN.sub('', texto)

    # Second pass: drop anything that is not in the whitelist.
    return _DISALLOWED_CHARS_PATTERN.sub('', texto_without_emojis)

In [None]:
def eliminar_stopwords_efficient(texto):
    """Tokenize a tweet, drop Spanish stop words, lowercase it and clean it.

    Args:
        texto: Raw tweet text.

    Returns:
        The surviving tokens joined by single spaces, lowercased and then
        passed through ``remove_emojis``.
    """
    palabras_utiles = []
    for token in word_tokenize(texto, language='spanish'):
        # Stop-word comparison is case-insensitive.
        if token.lower() in stopwords_es:
            continue
        palabras_utiles.append(token)

    texto_en_minusculas = ' '.join(palabras_utiles).lower()
    return remove_emojis(texto_en_minusculas)

In [None]:
# Replace every tweet text with its cleaned version (overwrites the raw text).
df_tweets['text'] = df_tweets['text'].map(eliminar_stopwords_efficient)

In [None]:
# Drop tweets left without any text.
# After cleaning, an emptied tweet is '' (or only spaces), not NaN, so `dropna`
# alone removes nothing; blank strings are filtered out explicitly as well.
# `.copy()` yields an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df_tweets = df_tweets.dropna(subset=['text'])
df_tweets = df_tweets[df_tweets['text'].str.strip() != ''].copy()
df_tweets.shape

In [None]:
# Compute the shingles of each tweet with ks.shingleset_list, requesting only
# size 4 (the [4] argument).
df_tweets['shingles'] = df_tweets['text'].map(
    lambda texto: ks.shingleset_list(texto, [4])
)
df_tweets

In [None]:
# Map each tweet id to its shingles (key = id, value = shingles).
# Zipping the two columns pairs the values row by row in a single pass, avoiding
# a chained label lookup (`df["col"][i]`) for every row.
dict_shingles = dict(zip(df_tweets['id'], df_tweets['shingles']))

In [None]:
# List of tweet ids (the keys of dict_shingles), in insertion order.
ids_tweet = list(dict_shingles)
len(ids_tweet)

In [None]:
# Build one MinHash signature (128 permutations) per tweet by feeding it every
# shingle of that tweet, UTF-8 encoded.
dict_minhash = {}
for id_tweet, shingles in dict_shingles.items():
    firma = MinHash(num_perm=128)
    for shingle in shingles:
        firma.update(shingle.encode('utf8'))
    dict_minhash[id_tweet] = firma

In [None]:
# Build the MinHashLSH index (threshold 0.6) and insert every signature under its
# tweet id. num_perm is 128 here, the same value used to build the signatures.
lsh = MinHashLSH(threshold=0.6, num_perm=128)
for id_tweet, firma in dict_minhash.items():
    lsh.insert(id_tweet, firma)

In [None]:
def buscar_tweets_similares(id_tweet, df_tweets, lsh, dict_minhash):
    """Look up the tweets the LSH index reports as similar to ``id_tweet``.

    Args:
        id_tweet: Key of the query tweet in ``dict_minhash``.
        df_tweets: DataFrame whose index labels are the keys stored in ``lsh``
            (rows are selected with ``.loc``).
        lsh: Index object exposing ``query(minhash)``.
        dict_minhash: Mapping from tweet id to its MinHash signature.

    Returns:
        The rows of ``df_tweets`` selected by the keys that ``lsh.query``
        returns for the query tweet's signature.

    Raises:
        KeyError: If ``id_tweet`` is not in ``dict_minhash`` or a returned key
            is not in the index of ``df_tweets``.
    """
    firma = dict_minhash[id_tweet]
    ids_candidatos = lsh.query(firma)
    return df_tweets.loc[ids_candidatos]

In [None]:
# Index the tweets by id so rows can be selected by tweet id with `.loc`.
df_tweets_indexado = df_tweets.set_index(keys='id')

In [None]:
# Find who writes similarly to one specific user: start by selecting that
# user's own tweets.
user_name = 'MacaSimplemente'
es_del_usuario = df_tweets_indexado['screen_name'].eq(user_name)
df_user = df_tweets_indexado[es_del_usuario]
df_user.head()

In [None]:
# For every tweet of the target user, query the LSH index for similar tweets and
# count how many times each *other* user appears among the matches.
# The query tweet itself is skipped, and so are the target user's own tweets:
# otherwise the user can end up ranked as their own most similar user.
dict_similares = {}
for id_tweet in df_user.index:
    tweets_similares = buscar_tweets_similares(id_tweet, df_tweets_indexado, lsh, dict_minhash)
    # Iterate (id, screen_name) pairs directly instead of a chained lookup per row.
    for id_similar, screen_name in tweets_similares['screen_name'].items():
        if id_similar == id_tweet or screen_name == user_name:
            continue
        dict_similares[screen_name] = dict_similares.get(screen_name, 0) + 1


In [None]:
# Top 3 users by number of similar tweets (ties keep dictionary order).
ranking_similares = sorted(
    dict_similares.items(),
    key=lambda par: par[1],
    reverse=True,
)
ranking_similares[:3]
