# Probar algoritmo de similaridad

In [13]:
# Re-implementing the solution
import polars as pl
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
import string
import spacy

# Stopwords and stemmer
# Snowball stemmer configured for Spanish.
stemmer = SnowballStemmer("spanish")
# NOTE(review): `nlp` is not referenced by any other code visible in this notebook.
# Presumably loading the model is also what makes `spacy.lang.es` reachable as an
# attribute on the next line — confirm before removing it.
nlp = spacy.load("es_core_news_sm")
# Spanish stopword collection shipped with spaCy.
spanish_stopwords_spacy = spacy.lang.es.stop_words.STOP_WORDS

In [14]:
# Topic vocabulary (mostly Spanish) used to flag texts/URLs related to drug
# trafficking and organized crime. Entries may be single words or multi-word
# phrases.
# NOTE(review): 'cartel' and 'cártel' differ only by an accent, and 'bust' is an
# English word. Generic terms such as 'operativo', 'ejecución', 'secuestro' or
# 'captura' are not specific to the topic — confirm they are intended.
keywords = [
    'narcotráfico', 'drogas', 'cocaína', 'marihuana', 'heroína', 'anfetaminas', 
    'metanfetaminas', 'éxtasis', 'crimen organizado', 'traficante', 'cartel', 
    'narcos', 'estupefacientes', 'psicotrópicos', 'incautación', 'tráfico de drogas', 
    'dealer', 'mafia', 'pasta base', 'crack', 'opiáceos', 'fentanilo', 'alcaloide', 
    'sintéticas', 'laboratorio clandestino', 'lavado de dinero', 'blanqueo de capitales', 'corrupción',
    'bust', 'operativo', 'narcopiso', 'mula', 'cártel', 'infiltrado', 'aprehensión',
    'narcolaboratorio', 'clandestino', 'narcobloqueo', 'túnel', 'narcoavioneta',
    'confiscación', 'narcosubmarino', 'captura', 'dosis', 'microtráfico', 
    'narcomenudeo', 'narcocorrido', 'decapitación', 'narcocultura', 'narcopolítica',
    'soborno', 'narcoguerra', 'narcoestado', 'narcotienditas', 'plantío',
    'narcotúnel', 'narcobodega', 'narcocampamento', 'narcobanda', 'narcobalacera',
    'secuestro', 'narcosecuestro', 'homicidio', 'masacre', 'ejecución'
]

## Regex


In [15]:
# Lookup table from Spanish accented letters (lower and upper case, including
# 'ü' and 'ñ') to their unaccented counterparts. The two strings are aligned
# position by position.
accent_mapping = dict(zip('áéíóúüñÁÉÍÓÚÜÑ', 'aeiouunAEIOUUN'))

def import_data(data):
    """Read a CSV source with polars and return the resulting frame.

    `data` is passed unchanged to `pl.read_csv`.
    """
    frame = pl.read_csv(data)
    return frame

def remove_accents(text):
    """Return `text` with each character listed in `accent_mapping` replaced.

    Characters that are not keys of `accent_mapping` are kept as they are.
    """
    plain_chars = []
    for ch in text:
        plain_chars.append(accent_mapping.get(ch, ch))
    return ''.join(plain_chars)

def normalize_text(url):
    """Turn a URL (or any text) into a list of stemmed, accent-free tokens.

    Processing order:
      1. lowercase the input;
      2. drop the ``http://`` / ``https://`` scheme;
      3. replace runs of non-word characters and underscores with a space;
      4. replace every digit with a space;
      5. strip ASCII punctuation;
      6. remove accents via `remove_accents`;
      7. split on whitespace, discard stopwords, stem what remains.

    Returns:
        list of str: the stemmed tokens, in their original order.
    """
    cleaned = url.lower()

    # Regex clean-up passes, applied in sequence.
    for pattern, replacement in (
        (r'https?://', ''),   # remove http/https
        (r'[\W_]+', ' '),     # non-alphanumeric characters -> space
        (r'[0-9]', ' '),      # digits -> space
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    # Strip ASCII punctuation, then drop accents.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = remove_accents(cleaned.translate(punctuation_table))

    # NOTE(review): stopwords are checked after accents were removed, so a
    # stopword that only exists in accented form in the stopword collection
    # would not be filtered — confirm this is acceptable.
    stems = []
    for token in cleaned.split():
        if token in spanish_stopwords_spacy:
            continue
        stems.append(stemmer.stem(token))
    return stems
    
def evaluate_similarity(urlshort, keywords):
    """Score how much of a URL/text is made of topic keywords.

    Both the text and every keyword go through `normalize_text`, so the
    comparison is done on stems.

    Args:
        urlshort: URL or free text to evaluate.
        keywords: iterable of keyword strings; multi-word phrases are allowed
            and contribute each of their tokens.

    Returns:
        tuple (score, matches):
          - score (float): matched tokens divided by total tokens of the
            normalized text, rounded to 2 decimals. It is 0.0 when the text
            has no tokens left after normalization.
          - matches (list of str): the normalized tokens of the text that are
            also keyword tokens, in order of appearance.
    """
    normalized_urlshort = normalize_text(urlshort)

    # Nothing left after normalization (e.g. only digits or stopwords):
    # return a neutral result instead of dividing by zero.
    if not normalized_urlshort:
        return 0.0, []

    # Normalize each keyword and flatten multi-word phrases into a set of
    # stems; a set gives constant-time membership tests.
    keyword_stems = {
        stem
        for keyword in keywords
        for stem in normalize_text(keyword)
    }

    # Find matches
    matches = [word for word in normalized_urlshort if word in keyword_stems]

    # Share of the text tokens that belong to the keyword vocabulary
    score = round(len(matches) / len(normalized_urlshort), ndigits=2)

    return score, matches

In [16]:
# Slug-like headline that mentions 'droga' once among unrelated political terms.
urlshort = 'politica alberto flamarique tras la muerte del droga mazzon el pj se fragmento y no logra salir n1186551'
# Evaluate similarity
score, found_matches = evaluate_similarity(urlshort, keywords)
# Display the score together with the matched stems.
score, found_matches

(0.09, ['drog'])

In [191]:
# Full article URL whose slug contains 'cocaina'.
urlshort = 'https://www.infobae.com/colombia/2023/08/07/autoridades-colombianas-y-espanolas-lograron-interceptar-una-embarcacion-con-1650-kilos-de-cocaina/'
# Evaluate similarity
score, found_matches = evaluate_similarity(urlshort, keywords)
# Display the score together with the matched stems.
score, found_matches

(0.08, ['cocain'])

In [192]:
# Full article URL whose slug contains 'droga' inside a political headline.
urlshort = "https://www.infobae.com/america/america-latina/2023/08/07/luis-arce-dedico-el-discurso-del-dia-de-la-independencia-boliviana-a-comparar-las-exportaciones-de-droga-a-lo-largo-de-los-anos/"
score, found_matches = evaluate_similarity(urlshort, keywords)
# Display the score together with the matched stems.
score, found_matches

(0.06, ['drog'])

In [193]:
# Full article URL whose slug contains 'narcotrafico'.
urlshort = "https://www.infobae.com/colombia/2023/08/06/fiscalia-entrego-95-bienes-provenientes-del-narcotrafico-para-reparar-a-victimas-estan-avaluados-en-mas-de-161000-millones/"
score, found_matches = evaluate_similarity(urlshort, keywords)
# Display the score together with the matched stems.
score, found_matches

(0.08, ['narcotraf'])

In [194]:
# Negative example: economic headline with no topic keyword in it.
urlshort = 'La suba del dólar trajo remarcaciones y elevó el piso de la inflación para agosto'
score, found_matches = evaluate_similarity(urlshort, keywords)
# Only the score is displayed here.
score

0.0

## Word2vec

In [None]:
# word2vec
# https://github.com/dccuchile/spanish-word-embeddings
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Cardellino model
# NOTE(review): the file name is 'wiki.es.vec' while this comment and the URL
# point to Cardellino's SBWCE — confirm which embeddings are actually loaded.
# NOTE(review): absolute, user-specific path; the notebook will not run elsewhere.
wordvectors_file_vec = '/home/sebacastillo/willow/models/wiki.es.vec' # https://crscardellino.ar/SBWCE/

# The link below suggested loading the vector model only partially to save memory
# https://github.com/dccuchile/spanish-word-embeddings/blob/master/examples/Ejemplo_WordVectors.ipynb
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=100000)

# NOTE(review): the variable names are misleading — they hold the vectors
# of 'droga' and 'cigarrillo', not of apple/mango.
v_apple = wordvectors['droga']
v_mango = wordvectors['cigarrillo']

# Cosine similarity between the two word vectors (displayed as the cell output).
cosine_similarity([v_apple],[v_mango])


array([[0.4810186]], dtype=float32)

In [195]:
from gensim.models.keyedvectors import KeyedVectors
#from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the word vectors (this should be done once and reused for multiple calls to avoid loading overhead)
# NOTE(review): this reads the same file, with the same `limit`, as the
# previous word2vec cell, so the model is loaded twice when both cells run.
# NOTE(review): absolute, user-specific path.
wordvectors_file_vec = '/home/sebacastillo/willow/models/wiki.es.vec' # path to your model
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=100000)


def compute_median_similarity(url, topic_word, wordvec, stopwords=None):
    """
    Compute the highest similarity between a topic word and the words of a URL.

    NOTE: despite its name (kept so existing calls keep working), this function
    returns the MAXIMUM similarity, not the median.

    The URL is lowercased, its http(s) scheme is dropped, non-word characters,
    underscores and digits are replaced by spaces, and stopwords are removed.
    Each remaining word that is present in `wordvec` is compared with
    `topic_word` through `wordvec.similarity`.

    Args:
    - url (str): URL or free text to evaluate.
    - topic_word (str): word every URL word is compared against.
    - wordvec: object supporting `word in wordvec` and
      `wordvec.similarity(word, topic_word)`.
    - stopwords (optional): collection of words to ignore. Defaults to the
      module-level `spanish_stopwords_spacy`.

    Returns:
    - float: the maximum similarity found, or 0.0 when no URL word is in
      `wordvec`.
    """
    if stopwords is None:
        stopwords = spanish_stopwords_spacy

    text = url.lower()
    text = re.sub(r'https?://', '', text)  # remove http/https
    text = re.sub(r'[\W_]+', ' ', text)    # replace non-alphanumeric characters with space
    text = re.sub(r'[0-9]', ' ', text)     # replace numeric characters with space
    url_words = [word for word in text.split() if word not in stopwords]

    # Similarity of every in-vocabulary URL word against the topic word
    scores = [
        wordvec.similarity(word, topic_word)
        for word in url_words
        if word in wordvec
    ]

    # Return the best score; 0.0 when nothing could be compared
    return np.max(scores) if scores else 0.0

In [196]:
# Test the function with the provided example
# The slug mentions 'marihuana' but not the topic word itself.
test_url = "https://example.com/nena-feliz-en-la-plaza-juega-con-su-mamá-que-fuma-marihuana"
test_topic_word = "narcotráfico"
compute_median_similarity(test_url, test_topic_word, wordvectors)

0.4447947

In [197]:
# Test the function with the provided example
# Plain headline (not a URL) that contains 'Narcos'.
test_url = "Narcos en Rosario: una guerra que golpeó a la política y afectó a los gobiernos del socialismo y el peronismo"
test_topic_word = "narcotráfico"
compute_median_similarity(test_url, test_topic_word, wordvectors)

0.69604456

In [198]:
# The text contains the topic word 'narcotráfico' itself.
test_url = 'Santa Fe, la única cartel donde el narcotráfico superó la capacidad del traficante'
test_topic_word = "narcotráfico"
compute_median_similarity(test_url, test_topic_word, wordvectors)

1.0

In [199]:
# Economic headline in which 'dólar' was swapped for 'blanqueo capitales'.
test_url = 'La suba del blanqueo capitales trajo remarcaciones y elevó el piso de la inflación para agosto'
test_topic_word = "narcotráfico"
compute_median_similarity(test_url, test_topic_word, wordvectors)

0.46996608

## Conclusión y pauta a utilizar

Fijaremos como *threshold* una similitud máxima > 0.4 para detectar verdaderos positivos (las URL con puntaje ≤ 0.4 se descartan).

# Ejemplo de implementación

In [3]:
import newspaper

In [4]:
url = 'https://www.infobae.com/' # building the source takes ~20 seconds; instantiated at 10:30

In [5]:
# Build a newspaper source for the site at `url`, with the language set to Spanish.
source = newspaper.build(url, language='es')

In [75]:
# Peek at the URL of the first article of the built source.
source.articles[0].url

'https://www.infobae.com/politica/2023/08/08/fuerte-gesto-de-la-cgt-a-5-dias-de-las-paso-movilizara-10000-trabajadores-para-un-acto-en-favor-de-massa/'

In [None]:
# Print the URL of every article of the source (one line per article).
for article in source.articles:
        print(article.url)

In [7]:
# Value reported by `source.size()` — presumably the number of articles; verify against the newspaper docs.
source_size = source.size()

In [18]:
def evaluate_matches(urlshort, keywords):
    """Score how much of a URL/text is made of topic keywords.

    Both the text and every keyword go through `normalize_text`, so the
    comparison is done on stems.

    Args:
        urlshort: URL or free text to evaluate.
        keywords: iterable of keyword strings; multi-word phrases are allowed
            and contribute each of their tokens.

    Returns:
        tuple (score, matches):
          - score (float): matched tokens divided by total tokens of the
            normalized text, rounded to 2 decimals. It is 0.0 when the text
            has no tokens left after normalization.
          - matches (list of str): the normalized tokens of the text that are
            also keyword tokens, in order of appearance.
    """
    normalized_urlshort = normalize_text(urlshort)

    # Nothing left after normalization (e.g. only digits or stopwords):
    # return a neutral result instead of dividing by zero.
    if not normalized_urlshort:
        return 0.0, []

    # Normalize each keyword and flatten multi-word phrases into a set of
    # stems; a set gives constant-time membership tests.
    keyword_stems = {
        stem
        for keyword in keywords
        for stem in normalize_text(keyword)
    }

    # Find matches
    matches = [word for word in normalized_urlshort if word in keyword_stems]

    # Share of the text tokens that belong to the keyword vocabulary
    score = round(len(matches) / len(normalized_urlshort), ndigits=2)

    return score, matches

In [28]:
# First filtering pass: keep the article URLs that contain at least one keyword stem.
urls_matches = []    # URLs with a positive keyword score
found_matches = []   # matched stems, aligned index by index with urls_matches
for article in source.articles:
    
    url = article.url
    match_score, found_match = evaluate_matches(url, keywords)
    
    # A positive score means at least one token of the URL matched a keyword.
    if match_score > 0:
        urls_matches.append(url) 
        found_matches.append(found_match)

# Dump both lists in full.
print(urls_matches)
print(found_matches)


['https://www.infobae.com/economia/2023/08/08/el-conflicto-sin-fin-de-lacteos-vidal-el-juez-quedo-de-rehen-de-la-mafia-sindical-denuncio-su-duena/', 'https://www.infobae.com/deportes/2023/08/08/el-curioso-detalle-de-lionel-messi-en-los-segundos-previos-a-la-ejecucion-del-tiro-libre-frente-a-fc-dallas/', 'https://www.infobae.com/salud/2023/08/08/luces-y-sombras-de-la-nueva-droga-inyectable-para-el-tratamiento-de-la-obesidad-segun-los-expertos/', 'https://www.infobae.com/america/2023/08/08/conmocion-por-un-brutal-femicidio-en-uruguay-la-secuestro-su-ex-pareja-y-la-mato-al-chocar-contra-un-peaje/', 'https://www.infobae.com/deportes/2023/08/08/los-detalles-del-operativo-inedito-para-evitar-nuevos-incidentes-entre-boca-juniors-y-nacional-sin-banderazos-y-con-horario-de-ingreso-exclusivo/', 'https://www.infobae.com/sociedad/2023/08/08/la-primera-conquista-del-desierto-la-operacion-militar-que-condujo-rosas-y-el-insolito-encuentro-con-darwin/', 'https://www.infobae.com/sociedad/2023/08/08/el-

In [51]:
def compute_max_similarity(url, topic_word, wordvec, stopwords=None):
    """
    Compute the maximum similarity between a topic word and the words of a URL.

    The URL is lowercased, its http(s) scheme is dropped, non-word characters,
    underscores and digits are replaced by spaces, and stopwords are removed.
    Each remaining word that is present in `wordvec` is compared with
    `topic_word` through `wordvec.similarity`.

    Args:
    - url (str): URL or free text to evaluate.
    - topic_word (str): word every URL word is compared against.
    - wordvec: object supporting `word in wordvec` and
      `wordvec.similarity(word, topic_word)`.
    - stopwords (optional): collection of words to ignore. Defaults to the
      module-level `spanish_stopwords_spacy`.

    Returns:
    - float: the maximum similarity found, or 0.0 when no URL word is in
      `wordvec` (an empty score list would otherwise make `np.max` raise).
    """
    if stopwords is None:
        stopwords = spanish_stopwords_spacy

    text = url.lower()
    text = re.sub(r'https?://', '', text)  # remove http/https
    text = re.sub(r'[\W_]+', ' ', text)    # replace non-alphanumeric characters with space
    text = re.sub(r'[0-9]', ' ', text)     # replace numeric characters with space
    url_words = [word for word in text.split() if word not in stopwords]

    # Similarity of every in-vocabulary URL word against the topic word
    scores = [
        wordvec.similarity(word, topic_word)
        for word in url_words
        if word in wordvec
    ]

    # Guard the empty case: np.max([]) raises ValueError
    if not scores:
        return 0.0

    return np.max(scores)

In [39]:
from gensim.models.keyedvectors import KeyedVectors
def load_embeddings(path='models/wiki.es.vec', limit=100000):
    """
    Read word embeddings stored in word2vec format.

    Args:
    - path (str): Location of the embeddings file.
    - limit (int): Passed as `limit` to the loader to cap how many word
      vectors are read.

    Returns:
    - KeyedVectors: The vectors returned by `KeyedVectors.load_word2vec_format`.
    """
    vectors = KeyedVectors.load_word2vec_format(path, limit=limit)
    return vectors

In [43]:
# Load the embeddings through the helper. NOTE(review): absolute, user-specific path.
wordvectors = load_embeddings(path='/home/sebacastillo/willow/models/wiki.es.vec', limit=100000)

In [60]:
import numpy as np
# Second filtering pass: score every keyword-matched URL against the topic
# word 'narcotráfico' using the word embeddings.
# NOTE(review): 'sin' in these names looks like a typo for 'sim' (similarity).
max_sin_scores = []      # best embedding similarity per URL
urls_second_match = []   # same URLs as urls_matches, in the same order
is_similar = []          # True when the best similarity is above 0.4
for url_match in urls_matches:
    max_sin_score = compute_max_similarity(url_match, 'narcotráfico', wordvectors) 
    max_sin_scores.append(max_sin_score)
    urls_second_match.append(url_match)
    # 0.4 is the similarity threshold applied to flag a URL as on-topic.
    is_similar.append(max_sin_score>0.4) 

# Collect the results in a polars DataFrame, sorted by the `is_similar` flag.
df = pl.DataFrame({
    "max_sin_scores": max_sin_scores,
    "urls_second_match": urls_second_match,
    "is_similar": is_similar
}).sort('is_similar')    

In [70]:
# Number of URLs flagged as similar.
df["is_similar"].sum()


4

In [71]:
# Keep only the rows flagged as similar.
df.filter(df['is_similar']==1)

max_sin_scores,urls_second_match,is_similar
f64,str,i64
0.539085,"""https://www.in…",1
0.605253,"""https://www.in…",1
0.546821,"""https://www.in…",1
0.609395,"""https://www.in…",1
