In [1]:
import polars as pl
import numpy as np
import re
from nltk.stem.snowball import SnowballStemmer
from gensim.models.keyedvectors import KeyedVectors
from bs4 import BeautifulSoup
import string
import spacy

# Spanish NLP resources: Snowball stemmer, spaCy pipeline and stop-word set.
# The stop-word submodule is imported explicitly so the lookup does not depend
# on `spacy.lang.es` having been imported implicitly by an earlier call.
from spacy.lang.es.stop_words import STOP_WORDS

stemmer = SnowballStemmer("spanish")
nlp = spacy.load("es_core_news_sm")
spanish_stopwords_spacy = STOP_WORDS


In [2]:
def load_embeddings(path="/home/sebacastillo/willow/models/wiki.es.vec", limit=100000):
    """
    Read word vectors stored in word2vec format.

    Args:
    - path (str): Location of the embeddings file.
    - limit (int): Maximum number of word vectors to read from the file.

    Returns:
    - KeyedVectors: The vectors loaded from `path`.
    """
    embeddings = KeyedVectors.load_word2vec_format(path, limit=limit)
    return embeddings

In [3]:
# Load the embeddings using the default path and vector limit of load_embeddings().
wordvec = load_embeddings()


In [4]:
# Sample article URL used as input for the tokenisation step.
url = 'https://www.infobae.com/economia/2023/08/14/el-bcra-cocaina-dolar-droga-a-350-fijo-narcotráfico-las-marihuana-y-suba-de-cocina-al-118-por-traficante-consumo-alchool-/'

In [5]:
import re

# Normalise the URL into lowercase word tokens and drop Spanish stop words.
text = re.sub(r"https?://", "", url.lower())  # strip the http/https scheme
text = re.sub(r"[\W_]+", " ", text)  # punctuation and underscores become spaces
text = re.sub(r"[0-9]", " ", text)  # every digit becomes a space
url_words = list(
    filter(lambda token: token not in spanish_stopwords_spacy, text.split())
)


In [6]:
# Display the tokens extracted from the URL.
url_words

['www',
 'infobae',
 'com',
 'economia',
 'bcra',
 'cocaina',
 'dolar',
 'droga',
 'fijo',
 'narcotráfico',
 'marihuana',
 'suba',
 'cocina',
 'traficante',
 'consumo',
 'alchool']

In [7]:
# Reference topic word that the URL tokens are compared against.
topic = 'narcotráfico'
print(topic)

narcotráfico


In [8]:
# Check that the topic word is a key of the loaded embeddings.
'narcotráfico' in wordvec

True

In [9]:
# Print the similarity between every in-vocabulary URL token and the topic.
for word in url_words:
    # Tokens without a vector cannot be scored, so skip them.
    if word not in wordvec:
        continue
    print(word)
    print(topic)
    similarity = wordvec.similarity(word, topic)
    print(similarity)
   
        


www
narcotráfico
0.12122729
infobae
narcotráfico
0.3483704
com
narcotráfico
0.13672991
economia
narcotráfico
0.24278423
dolar
narcotráfico
0.2998347
droga
narcotráfico
0.60525346
fijo
narcotráfico
0.14581311
narcotráfico
narcotráfico
1.0
marihuana
narcotráfico
0.4447947
suba
narcotráfico
0.20002475
cocina
narcotráfico
0.116768844
traficante
narcotráfico
0.59214777
consumo
narcotráfico
0.30924425


# Similarity with summaries

In [10]:
import polars as pl
# Read df_clean_contsum.csv (path relative to the working directory) into a polars frame.
df = pl.read_csv('df_clean_contsum.csv')

In [11]:
# Inspect the column names of the loaded frame.
df.columns

['sumary', 'content_cleaned']

In [14]:
# Pull the 'sumary' column out of the frame as a plain Python list.
summaries = df['sumary'].to_list()

In [127]:
import re
import numpy as np

In [166]:
topics = ['narcotráfico', 'droga', 'marihuana', 'contrabando', 'traficante']
threshold = 0.6
positivos = 0

# wordvec.similarity() raises KeyError for keys missing from the embeddings,
# so only topics that are actually in the vocabulary are scored.
topics_in_vocab = [topic for topic in topics if topic in wordvec]

# For each summary: take, per topic, the highest word/topic similarity, average
# those maxima over the topics and flag the summary when the average reaches
# the threshold.
for s in summaries:
    print('-' * 70)
    print(s)
    # Guard against None entries, which have no .lower(); score them as empty text.
    text = (s or "").lower()
    text = re.sub(r"https?://", "", text)  # remove http/https
    text = re.sub(r"[\W_]+", " ", text)  # replace non-alphanumeric characters with space
    text = re.sub(r"[0-9]", " ", text)  # replace numeric characters with space
    summary_w = [word for word in text.split() if word not in spanish_stopwords_spacy]
    # Vocabulary membership does not depend on the topic, so filter once per summary.
    scorable_w = [word for word in summary_w if word in wordvec]

    total_scores = []
    for topic in topics_in_vocab:
        scores = [wordvec.similarity(word, topic) for word in scorable_w]
        # Best match for this topic; sorting and slicing a top-3 first would
        # not change the maximum, so take it directly.
        max_score = max(scores) if scores else 0
        total_scores.append(max_score)

    average_score = sum(total_scores) / len(total_scores) if total_scores else 0
    print(f"Average score: {average_score}")
    if average_score >= threshold:
        print("_____POSITIVO_____")
        positivos += 1
print(positivos)


----------------------------------------------------------------------
El consejo para ahorrar energía y agua al utilizar el lavarropa. La Federación Argentina de Cooperativas de Electricidad y Otros Servicios Públicos (FACE) debe dejar una serie de consejos para alcanzar la eficiencia energética
Average score: 0.2923860490322113
----------------------------------------------------------------------
Cómo gastar el crédito de Previaje en algunos comercios. La tarjeta precargada del Banco Nación se puede usar para comprar en locales y webs de los rubros habilitados
Average score: 0.3446509659290314
----------------------------------------------------------------------
Bienvenido a El Día de La Plata. La prensa argentina se reúne con nuestros usuarios para conocer las noticias locales
Average score: 0.25262933373451235
----------------------------------------------------------------------
La violencia vuelve a poner en peligro a Nelson Daniel Peralta. El asesinato de Morena Domínguez y al

Falso negativo
- El gobierno secuestró 3.800 kilogramos de marihuana en San Vicente. El Ejército y la Guardia Civil secuestaron un cargamento de más de 3.800 kilogramos de marihuana
Average score: 0.5756970793008804

# Similarity with links

In [168]:
import polars as pl
# Read df_clean.csv into a polars frame. NOTE: this rebinds `df`, replacing the
# frame previously loaded from df_clean_contsum.csv.
df = pl.read_csv('df_clean.csv', ignore_errors=True)

In [169]:
# Inspect the column names of the loaded frame.
df.columns

['date_extract',
 'date_article',
 'topic',
 'content',
 'link',
 'titles',
 'sumaries',
 'authors',
 'portal',
 'state',
 'city',
 'content_hash',
 'content_nchar',
 'content_cleaned',
 'sumary']

In [171]:
# Pull the 'link' column out of the frame as a plain Python list.
links = df['link'].to_list()

In [187]:
# Read the 'keywords' column of topics.csv straight into a Python list and display it.
topics = pl.read_csv('/home/sebacastillo/willow/data/topics.csv')['keywords'].to_list()
topics

['narcotráfico',
 'drogas',
 'cocaína',
 'marihuana',
 'heroína',
 'anfetaminas',
 'éxtasis',
 'traficante',
 'narcos',
 'narco',
 'estupefacientes',
 'incautación',
 'dealer',
 'mula',
 'clandestino',
 'cargamento']

In [191]:
threshold = 0.45
positivos = 0
links_pos = []
links_neg = []

# wordvec.similarity() raises KeyError for keys missing from the embeddings,
# so only topics that are actually in the vocabulary are scored.
topics_in_vocab = [topic for topic in topics if topic in wordvec]

# Score every article link (not the summaries): per topic take the highest
# word/topic similarity among the URL tokens, average those maxima over the
# topics, and sort the link into links_pos / links_neg by the threshold.
for link in links:
    print('-' * 70)
    print(link)
    # Guard against None entries, which have no .lower(); score them as empty text.
    text = (link or "").lower()
    text = re.sub(r"https?://", "", text)  # remove http/https
    text = re.sub(r"[\W_]+", " ", text)  # replace non-alphanumeric characters with space
    text = re.sub(r"[0-9]", " ", text)  # replace numeric characters with space
    link_w = [word for word in text.split() if word not in spanish_stopwords_spacy]
    # Vocabulary membership does not depend on the topic, so filter once per link.
    scorable_w = [word for word in link_w if word in wordvec]

    total_scores = []
    for topic in topics_in_vocab:
        scores = [wordvec.similarity(word, topic) for word in scorable_w]
        # Best match for this topic; sorting and slicing a top-3 first would
        # not change the maximum, so take it directly.
        max_score = max(scores) if scores else 0
        total_scores.append(max_score)

    average_score = sum(total_scores) / len(total_scores) if total_scores else 0
    print(f"Average score: {average_score}")
    if average_score >= threshold:
        print("_____POSITIVO_____")
        positivos += 1
        links_pos.append(link)
    else:
        links_neg.append(link)
print(positivos)

----------------------------------------------------------------------
https://www.elcordillerano.com.ar/noticias/2023/07/31/167023-ahorra-energia-y-dinero-al-utilizar-tu-lavarropa-en-el-momento-indicado
Average score: 0.35089994966983795
----------------------------------------------------------------------
http://www.clarin.com/servicios/previaje-2023-averiguar-dinero-queda-tarjeta-podra-usar_0_CRmszgmXYx.html
Average score: 0.34814098104834557
----------------------------------------------------------------------
https://www.eldia.com/nota/2023-8-13-2-8-3-el-cartel-del-medido-tapado-por-las-ramas-en-47-4-y-5-la-ciudad
Average score: 0.3181564752012491
----------------------------------------------------------------------
https://www.minutoneuquen.com/nacionales/2023/8/11/otro-crimen-en-el-conurbano-conmociona-al-pais-mataron-un-hombre-de-50-anos-en-un-asalto-336215.html
Average score: 0.39803275279700756
----------------------------------------------------------------------
https://

In [194]:
# List every entry that reached the threshold, one per line.
for link in links_pos:
    print(link)

https://www.rosario3.com/policiales/Gendarmeria-incauto-municiones-ilegales-que-trasladaban-en-camion-desde-Tucuman-20230814-0047.html
https://www.elterritorio.com.ar/noticias/2023/08/10/799515-incautaron-una-tonelada-de-marihuana-y-cocaina-en-puerto-libertad
http://www.clarin.com/policiales/sorprendente-perfil-estudiaba-economia-vivia-palermo-cayo-encapsular-mula-narco_0_otIja55Mr4.html
https://www.elcordillerano.com.ar/noticias/2023/08/10/167764-amenazaron-de-muerte-al-taxista-que-denuncio-la-mafia-narco-de-los-taxis
http://www.diarioeloranense.com.ar/index.php/deporte/3329-secuestran-mas-de-18-kilos-de-marihuana-y-detienen-a-un-ciudadan
https://www.elterritorio.com.ar/noticias/2023/08/09/799361-la-autopsia-al-nieto-de-robert-de-niro-revelo-el-coctel-de-drogas-que-provoco-su-muerte
https://www.diariopopular.com.ar/salud/cuando-estara-el-pais-la-droga-inyectable-bajar-peso-n733445
https://www.elcordillerano.com.ar/noticias/2023/06/26/164438-dia-internacional-contra-el-uso-indebido-y-e