In [1]:
import os
os.getcwd()

'/home/sebacastillo/willow/dev'

In [5]:
%cd /home/sebacastillo/willow

/home/sebacastillo/willow


In [8]:
import glob
from datetime import datetime
import polars as pl

def load_data(pattern='output/news_narcotráfico_related_*.csv'):
    """Load the most recent news extract matching ``pattern``.

    File names are expected to end in ``_<YYYY-MM-DD>_<HHMM>.csv``. That
    trailing timestamp in the name (not the file's modification time)
    decides which file counts as the newest.

    Args:
        pattern: Glob pattern of candidate CSV files. A relative pattern is
            resolved against the current working directory. Defaults to the
            narcotráfico extracts under ``output/``.

    Returns:
        The DataFrame returned by ``pl.read_csv`` for the newest matching
        file, with ``content_hash`` read as ``pl.UInt64``.

    Raises:
        FileNotFoundError: If no file matches ``pattern``.
        ValueError: If a matching file name does not end in the expected
            ``_<YYYY-MM-DD>_<HHMM>.csv`` timestamp.
    """
    def extraction_time(path):
        # '..._2023-08-16_1355.csv' -> day='2023-08-16', clock='1355.csv'
        _, day, clock = path.rsplit('_', 2)
        # clock[:-4] drops the '.csv' extension before parsing.
        return datetime.strptime(f'{day}_{clock[:-4]}', '%Y-%m-%d_%H%M')

    # Step 1: Get the list of all files with the specific pattern
    files = glob.glob(pattern)
    if not files:
        raise FileNotFoundError(f"No files match the pattern {pattern!r}")

    # Step 2: Pick the file whose name carries the newest timestamp
    latest_file = max(files, key=extraction_time)

    # Step 3: Read the latest file
    return pl.read_csv(latest_file, dtypes={'content_hash': pl.UInt64})

In [9]:
# Example usage
data = load_data()
print(data)

shape: (48, 14)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ date_extr ┆ date_arti ┆ topic     ┆ content   ┆ … ┆ state     ┆ city      ┆ content_h ┆ content_ │
│ act       ┆ cle       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ash       ┆ nchar    │
│ ---       ┆ ---       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ ---       ┆ ---      │
│ str       ┆ str       ┆           ┆           ┆   ┆           ┆           ┆ u64       ┆ i64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2023-08-1 ┆ null      ┆ narcotráf ┆ La        ┆ … ┆ Santiago  ┆ La Banda  ┆ 150828215 ┆ 1387     │
│ 7         ┆           ┆ ico       ┆ Dirección ┆   ┆ del       ┆           ┆ 581557643 ┆          │
│ 15:58:20  ┆           ┆           ┆ de Preven ┆   ┆ Estero    ┆           ┆ 53        ┆          │
│           ┆           ┆           ┆ ción      ┆   ┆           ┆          

In [3]:
from src.transform import *

In [4]:
import polars as pl

In [5]:
# Read one specific extract by its full path.
# content_hash is read as UInt64: values such as 9259959111029284149 would not
# fit in a signed 64-bit integer.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
df = pl.read_csv('/home/sebacastillo/willow/output/news_narcotráfico_related_2023-08-16_1355.csv',
                 dtypes={'content_hash': pl.UInt64})

In [6]:
df.head(1)

date_extract,date_article,topic,content,link,link_sim_score,title,summary,authors,portal,state,city,content_hash,content_nchar
str,str,str,str,str,f64,str,str,str,str,str,str,u64,i64
"""2023-08-16 13:…","""2023-08-16T13:…","""narcotráfico""","""Seguinos Por L…","""https://www.la…",0.561929,"""Condenaron a s…","""Brian está det…","""n-a""","""https://www.ar…","""Santa Fe""","""Rosario""",9259959111029284149,4543


# Processing with functions

In [9]:
data = clean_dataframe(df)

In [10]:
data.head(1)

date_extract,date_article,topic,content,link,link_sim_score,title,summary,authors,portal,state,city,content_hash,content_nchar
date,date,str,str,str,f64,str,str,str,str,str,str,u64,i64
2023-08-16,2023-08-16,"""narcotráfico""","""Seguinos Por L…","""https://www.la…",0.561929,"""Condenaron a s…","""Brian está det…","""n-a""","""https://www.ar…","""Santa Fe""","""Rosario""",9259959111029284149,4543


# Summarize

In [11]:
# Inputs for the summarization step (all consumed by summarize_articles in the next cell)
# Model identifier passed to summarize_articles as model_str.
summary_model_str = "IIC/mt5-spanish-mlsum"
#ner_model = "mrm8488/bert-spanish-cased-finetuned-ner"
# Relative path, so it depends on the working directory set with %cd.
# NOTE(review): limit presumably caps how many vectors are loaded; confirm in src.transform.
word_vectors = load_embeddings(path="models/wiki.es.vec", limit=200000)
# Keywords for the 'narcotráfico' topic, as returned by load_keywords.
keywords = load_keywords(topic='narcotráfico')


In [12]:
data = summarize_articles(data, model_str=summary_model_str,keywords=keywords, word_vectors=word_vectors)

# Breakpoint: reset environment

In [1]:
import polars as pl
# Reload news_summaries.csv after the kernel reset.
# NOTE(review): presumably the summarize_articles output saved by the
# (now commented-out) data.write_csv call further down; confirm.
# content_hash is read as UInt64, the same override used for the raw extracts.
df = pl.read_csv('/home/sebacastillo/willow/output/news_summaries.csv',
                 dtypes={'content_hash': pl.UInt64})

In [2]:
# Convert both date columns from text to pl.Date.
# - date_extract is parsed as-is with the fixed '%Y-%m-%d' format.
# - date_article keeps only its first 10 characters before parsing.
# NOTE(review): df is overwritten in place, so re-running this cell on the
# already-converted frame is expected to fail (the columns are no longer text).
df = df.with_columns([
    pl.col('date_extract').str.strptime(pl.Date, format='%Y-%m-%d', strict=True),
    pl.col('date_article').str.slice(0, 10).str.strptime(pl.Date, format='%Y-%m-%d'),
])

In [11]:
#data.write_csv('/home/sebacastillo/willow/output/news_summaries.csv')
df.shape

(44, 16)

In [4]:
%cd /home/sebacastillo/willow

/home/sebacastillo/willow


In [5]:
from src.transform import *

In [6]:
ner_model = "mrm8488/bert-spanish-cased-finetuned-ner"

In [9]:
def ner_function(text):
    """Run NER on one document by delegating to ner_on_large_document.

    Only ``text`` is forwarded; the ``ner_model`` name assigned in the
    previous cell is not passed here. Customize this wrapper as needed.
    """
    return ner_on_large_document(text)

In [None]:
ner_news_df = calculate_ner(df, ner_function)

In [14]:
def arrange_datasets(news_df, ner_news_df):
    """Number the articles and put both tables into their final column order.

    A 1-based ``index`` column is added to ``news_df``. The ``link`` and
    ``content_hash`` of each article are then left-joined onto
    ``ner_news_df`` through that ``index``.

    Args:
        news_df: Article table; must contain every column named in
            ``news_layout`` except ``index``.
        ner_news_df: Entity table with an ``index`` column plus
            ``entity_group``, ``score``, ``word``, ``start`` and ``end``.
            Assumes its ``index`` is the 1-based row position of the article
            in ``news_df`` (TODO confirm against calculate_ner).

    Returns:
        ``(arranged_news_df, arranged_ner_df)``. If anything raises, the
        error is printed and ``(None, None)`` is returned instead.
    """
    news_layout = [
        'index', 'topic', 'date_extract', 'date_article', 'content', 'portal', 'link',
        'link_sim_score', 'title', 'summary', 'summary_llm', "summary_sim_score", 'authors',
        'state', 'city', 'content_hash', 'content_nchar',
    ]
    ner_layout = [
        'index', 'link', 'content_hash', 'entity_group', 'score', 'word', 'start', 'end',
    ]

    try:
        # Articles are numbered 1..n in their current row order.
        row_numbers = list(range(1, news_df.shape[0] + 1))
        numbered_news = news_df.with_columns(pl.Series("index", row_numbers))

        # Attach each article's identifying columns to its entities.
        article_keys = numbered_news[['link', 'content_hash', 'index']]
        ner_with_keys = ner_news_df.join(article_keys, on='index', how='left')

        return numbered_news.select(news_layout), ner_with_keys.select(ner_layout)
    except Exception as e:
        print(f"An error occurred during dataset arrangement: {e}")
        return None, None

In [15]:
news, newsner = arrange_datasets(df, ner_news_df)

In [16]:
news.head(3)

index,topic,date_extract,date_article,content,portal,link,link_sim_score,title,summary,summary_llm,summary_sim_score,authors,state,city,content_hash,content_nchar
i64,str,date,date,str,str,str,f64,str,str,str,f64,str,str,str,u64,i64
1,"""narcotráfico""",2023-08-16,2023-08-16,"""Seguinos Por L…","""https://www.ar…","""https://www.la…",0.561929,"""Condenaron a s…","""Brian está det…","""Seis personas …",0.5241,"""n-a""","""Santa Fe""","""Rosario""",9259959111029284149,4543
2,"""narcotráfico""",2023-08-16,2023-08-16,"""Escuadrón 1 “R…","""https://www.ar…","""http://www.dia…",0.476911,"""Encuentran 32 …","""na""","""La Fiscalía Fe…",0.5945,"""Diario El Oran…","""Salta""","""Orán""",6046667906668190206,707
3,"""narcotráfico""",2023-08-16,2023-08-16,""". Compartir en…","""https://www.ar…","""https://www.ar…",0.526558,"""Prefectura sec…","""Efectivos de l…","""La Prefectura …",0.5717,"""n-a""","""Argentina""","""Argentina""",12244461015022324118,657


In [17]:
newsner.head(3)

index,link,content_hash,entity_group,score,word,start,end
i64,str,u64,str,f64,str,i64,i64
1,"""https://www.la…",9259959111029284149,"""PER""",0.9998,"""Leo Graciarena…",13,27
1,"""https://www.la…",9259959111029284149,"""PER""",0.999691,"""Elena Beatriz …",37,58
1,"""https://www.la…",9259959111029284149,"""ORG""",0.984321,"""Tribunal Oral …",72,95


In [18]:
# Persist both arranged tables as CSV files.
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
news.write_csv('/home/sebacastillo/willow/output/news.csv')
newsner.write_csv('/home/sebacastillo/willow/output/newsner.csv')