# New filter on summary

In [2]:
import polars as pl
# Load the cleaned articles table from the notebook's working directory.
# NOTE(review): ignore_errors=True presumably makes unparseable values load as
# nulls instead of raising; the preview shows null content_hash values in
# several rows, so confirm nothing important is being silently dropped.
df = pl.read_csv('df_clean.csv', ignore_errors=True)
# Preview the first rows to check the columns and inferred dtypes.
df.head()

date_extract,date_article,topic,content,link,titles,sumaries,authors,portal,state,city,content_hash,content_nchar,content_cleaned,sumary
str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str
"""2023-08-14""","""2023-07-31""","""narcotráfico""","""Para lograr un…","""https://www.el…","""Ahorra energía…","""Lavar la ropa …","""n-a""","""https://www.lm…","""Río Negro""","""Bariloche""",8.469788906430179e+18,3503,"""Para lograr un…","""El consejo par…"
"""2023-08-14""","""2023-08-14""","""narcotráfico""","""Dólar blue hoy…","""http://www.cla…","""Previaje 2023:…","""Previaje 2023:…","""Martín Grosz""","""https://www.lm…","""CABA""","""Buenos Aires""",,7350,"""Dólar blue hoy…","""Cómo gastar el…"
"""2023-08-14""","""2023-08-13""","""narcotráfico""","""Estimado lecto…","""https://www.el…","""El cartel del …","""Diario El Día …","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",,2782,"""Estimado lecto…","""Bienvenido a E…"
"""2023-08-14""","""2023-08-11""","""narcotráfico""","""La violencia e…","""https://www.mi…","""Otro crimen en…","""Mientras Peral…","""Lucía Paz Gime…","""https://www.lm…","""Neuquén""","""Plottier""",,2706,"""La violencia e…","""La violencia v…"
"""2023-08-14""","""2023-08-14""","""narcotráfico""","""27°SAN LUIS - …","""https://www.el…","""Ordenaron 18 a…","""Los mismos efe…","""El Diario De L…","""https://www.lm…","""San Luis""","""San Luis""",,5520,"""27°SAN LUIS - …","""Tres detenidos…"


In [3]:
# NOTE(review): despite the variable name and the notebook title, this reads the
# 'titles' column; the frame also has 'sumaries' and 'sumary' columns.
# Confirm which column is meant to be scored.
summaries = df['titles'].to_list()
# Display the first entry as a quick sanity check.
summaries[0]


'Ahorra energía y dinero al utilizar tu lavarropa en el momento indicado'

In [4]:
# Read the topic keywords straight into a plain Python list, so `keywords`
# never holds the intermediate DataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
keywords = (
    pl.read_csv('/home/sebacastillo/willow/data/topics.csv')['keywords'].to_list()
)
keywords

['narcotráfico',
 'drogas',
 'cocaína',
 'marihuana',
 'heroína',
 'anfetaminas',
 'éxtasis',
 'traficante',
 'narcos',
 'narco',
 'estupefacientes',
 'incautación',
 'dealer',
 'mula',
 'clandestino',
 'cargamento']

In [5]:
import re
from nltk.stem.snowball import SnowballStemmer
from gensim.models.keyedvectors import KeyedVectors
from bs4 import BeautifulSoup
import string
import spacy
import numpy as np

# Stopwords and stemmer.
# Snowball stemmer configured for Spanish.
stemmer = SnowballStemmer("spanish")
# Small Spanish spaCy pipeline.
nlp = spacy.load("es_core_news_sm")
# Spanish stopword set used when tokenising text for similarity scoring.
# NOTE(review): this attribute access needs the spacy.lang.es submodule to be
# imported already; presumably spacy.load above does that as a side effect.
# Confirm, or import STOP_WORDS explicitly, before reordering these lines.
spanish_stopwords_spacy = spacy.lang.es.stop_words.STOP_WORDS

In [6]:
def compute_similarity(string, keywords, wordvec, stopwords=None):
    """
    Score how closely a text matches a list of topic keywords.

    The text is lower-cased, stripped of URL schemes, punctuation and digits,
    split on whitespace and filtered against a stopword set. For every keyword,
    the best (maximum) similarity between that keyword and any remaining word
    known to ``wordvec`` is taken; the result is the mean of those per-keyword
    maxima.

    Args:
    - string (str): Text to score. ``None`` or an empty string scores 0.0.
    - keywords (list[str]): Topic words to compare against. Keywords that are
      not in ``wordvec`` are skipped, since no similarity can be computed
      for them.
    - wordvec: Embedding lookup supporting ``word in wordvec`` and
      ``wordvec.similarity(word_a, word_b)``.
    - stopwords (optional): Collection of words dropped before scoring.
      Defaults to the module-level ``spanish_stopwords_spacy`` set.

    Returns:
    - float: Mean of the per-keyword maximum similarities, or 0.0 when
      nothing can be scored.
    """
    # Missing text (e.g. a null cell read from the CSV) has nothing to score.
    if not string:
        return 0.0
    if stopwords is None:
        stopwords = spanish_stopwords_spacy

    text = string.lower()
    text = re.sub(r"https?://", "", text)  # remove http/https
    text = re.sub(r"[\W_]+", " ", text)  # replace non-alphanumeric characters with space
    text = re.sub(r"[0-9]", " ", text)  # replace numeric characters with space

    # Vocabulary membership does not depend on the keyword, so filter once
    # here instead of once per keyword.
    known_words = [
        word for word in text.split()
        if word not in stopwords and word in wordvec
    ]

    total_scores = []
    for keyword in keywords:
        # An out-of-vocabulary keyword cannot be compared with anything;
        # passing it to wordvec.similarity would fail instead of scoring.
        if keyword not in wordvec:
            continue
        scores = [wordvec.similarity(word, keyword) for word in known_words]
        # A keyword with no comparable word in the text contributes 0.0.
        total_scores.append(np.max(scores) if scores else 0.0)

    # Average of the per-keyword best matches.
    return sum(total_scores) / len(total_scores) if total_scores else 0.0


In [7]:
def load_embeddings(path="models/wiki.es.vec", limit=200000):
    """
    Read word vectors stored in word2vec format.

    Args:
    - path (str): Location of the embeddings file.
    - limit (int): Maximum number of word vectors to load, passed straight
      through to ``KeyedVectors.load_word2vec_format``.

    Returns:
    - KeyedVectors: The loaded word vectors.
    """
    vectors = KeyedVectors.load_word2vec_format(path, limit=limit)
    return vectors

In [8]:
def filter_articles_with_similarity(
    strings, keywords, wordvectors, similarity_treshold
):
    """
    Keep the texts whose keyword similarity exceeds a threshold.

    Args:
    - strings: Iterable of texts to score.
    - keywords: Topic words forwarded to ``compute_similarity``.
    - wordvectors: Embedding lookup forwarded to ``compute_similarity``.
    - similarity_treshold: Texts must score strictly above this value to be kept.

    Returns:
    - tuple[list, list]: The kept texts and their scores, in input order and
      aligned by position.
    """
    # Score lazily, in input order, exactly once per text.
    scored = (
        (text, compute_similarity(text, keywords, wordvectors))
        for text in strings
    )
    kept = [(text, score) for text, score in scored if score > similarity_treshold]

    matched_texts = [text for text, _ in kept]
    matched_scores = [score for _, score in kept]
    return matched_texts, matched_scores

In [9]:
# NOTE(review): KeyedVectors is already imported in an earlier cell; this
# repeated import is redundant but harmless.
from gensim.models.keyedvectors import KeyedVectors
# Load at most 200000 vectors from wiki.es.vec (presumably Spanish Wikipedia
# embeddings; confirm).
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
word_vectors = load_embeddings(path="/home/sebacastillo/willow/models/wiki.es.vec", limit=200000)

In [10]:

# Loose first pass: keep everything scoring above 0.2. The cells that follow
# apply stricter cutoffs (0.5, and the 0.44-0.5 band) to these results.
sumary_match, summary_match_score = filter_articles_with_similarity(
    strings=summaries,
    keywords=keywords,
    wordvectors=word_vectors,
    similarity_treshold=0.2,
)


## Los sumarios detectados

In [11]:
# Print the matches whose score clears the stricter 0.5 cutoff.
# The loop variable is deliberately not called `sum`: at notebook top level
# that would rebind the built-in, and compute_similarity calls sum(), so any
# later re-run of the filter would fail.
for summary, score in zip(sumary_match, summary_match_score):
    if score > 0.5:
        print(f'SCORE: {score} - sumary: {summary}')

SCORE: 0.5908334627747536 - sumary: En una despensa de Posadas incautaron 131 dosis de cocaína: dos detenidos con poco más de 7 millones de pesos
SCORE: 0.6013110671192408 - sumary: Incautaron una tonelada de marihuana y cocaína en Puerto Libertad
SCORE: 0.5139567153528333 - sumary: Sorprendente perfil: estudiaba Economía, vivía en Palermo y cayó por encapsular a una mula narco
SCORE: 0.5635197144001722 - sumary: Imputaron y dictaron prisión preventiva para cinco integrantes de una organización narcocriminal señalados por el transporte de 423 kilos de cocaína
SCORE: 0.5036423578858376 - sumary: Secuestran más de 18 kilos de marihuana y detienen a un ciudadano
SCORE: 0.5570160690695047 - sumary: La autopsia al nieto de Robert De Niro reveló el cóctel de drogas que provocó su muerte
SCORE: 0.540751064196229 - sumary: Cuándo estará en el país la droga inyectable para bajar de peso
SCORE: 0.5605891076847911 - sumary: Día internacional contra el uso indebido y el tráfico de drogas: “El 90% 

# Los sumarios de mayor puntaje que quedaron afuera

In [12]:
# Print the near misses: matches scoring in the (0.44, 0.5] band, just below
# the 0.5 cutoff.
# The loop variable is deliberately not called `sum`: at notebook top level
# that would rebind the built-in, and compute_similarity calls sum(), so any
# later re-run of the filter would fail.
for summary, score in zip(sumary_match, summary_match_score):
    if 0.44 < score <= 0.5:
        print(f'SCORE: {score} - sumary: {summary}')

SCORE: 0.4783011535182595 - sumary: Gendarmería incautó municiones ilegales que trasladaban en camión desde Tucumán
SCORE: 0.48086682707071304 - sumary: Amenazaron de muerte al taxista que denunció la mafia narco de los taxis
SCORE: 0.488741934299469 - sumary: Mataron a un conocido narcotraficante y a su pareja
SCORE: 0.48693016171455383 - sumary: Fuerte despliegue anti-narco en Resistencia: dos mujeres detenidas tras allanamiento en Villa Ercilia
SCORE: 0.4734446248039603 - sumary: Desbarataron un aguantadero y un kiosco narco en el barrio Ñu Porá
SCORE: 0.4742904109880328 - sumary: Condenaron a una pareja narco a varios años de prisión en Córdoba
SCORE: 0.4671014631167054 - sumary: Incautaron 50 cajas de municiones ilegales
SCORE: 0.4805230973288417 - sumary: Ex policía fue detenido por administrar un kiosco narco en Posadas


Conclusión: se podría fijar como criterio de inclusión el promedio de los tres scores: link, título y sumario.