# NER full processing

In [None]:
import polars as pl
import re 
import time
def view_string(long_string, chunk_size=100):     
    return [long_string[i:i+chunk_size] for i in range(0, len(long_string), chunk_size)]

In [2]:
df = pl.read_csv(   
    '/home/sebacastillo/willow/output/news_narcotráfico_related_2023-08-14_1522.csv',
    dtypes={"content_hash": pl.UInt64},     
)

# Cleaning febore write to DB

In [3]:
df_clean = (
    df.with_columns([
        pl.col('date_extract').str.strptime(pl.Date, format='%Y-%m-%d %H:%M:%s', strict=True),
        pl.col('date_article').str.slice(0, 10).str.strptime(pl.Date, format='%Y-%m-%d'),
        pl.col("content").str.n_chars().alias("content_nchar"),
        pl.col("content").hash().alias("content_hash"),
        # Add other columns/transformations as required
          # Replace new lines and tabs with a single space, then replace multiple spaces with a single space
        pl.col("content").str.replace_all(r"[\n\t]+", " ")
        .str.replace_all(r"\s{2,}", " ")
        .str.strip().str.strip().alias("content_cleaned"),  # Adjusted
    ])
)


In [4]:
df_clean.shape

(122, 14)

## Filtro con contenido escaso o nulo

In [5]:
# por extension menor a 400 caracteres: muy poca información
df_clean = (
    df_clean
    .filter(pl.col('content_nchar') > 400)
)

In [6]:
df_clean.shape

(119, 14)

# Agrego summary2_llm

In [16]:
articles = df_clean['content_cleaned'].to_list()
articles = articles[114:119]
for article in articles:
    print(article)

“María Marta: El crimen del country”, la miniserie de ocho capítulos que estrenó en julio del año pasado a través de la plataforma de streaming HBO Max y la señal HBO, y que con los protagónicos de Laura Novoa y Jorge Marrale ficcionaliza la historia del asesinato de María Marta García Belsunce, podrá verse en un canal de cable básico con su llegada el 15 de este mes a las 22 a TNT.De esta manera, la adaptación del recordado caso policial, que trascendió el 27 de octubre de 2002 en nuestro país, tendrá un nuevo paso por la pantalla luego de cosechar nominaciones en los Premios Platinos al Audiovisual Iberoamericano y en los Premios Cóndor de Plata.En “María Marta: El crimen del country”, el hecho es adaptado en capítulos de 45 minutos cada uno, que proponen un abordaje del caso que presenta nuevas perspectivas a raíz de un trabajo que implicó a múltiples fuentes y miradas que permiten dar un vistazo al mundo íntimo de María Marta y recrear lo que sucedió antes, durante y después de su 

In [None]:
#embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
    #                                   model_kwargs={'device': 'cpu'})
#embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-base',
#                                       model_kwargs={'device': 'cpu'})

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_str = "IIC/mt5-spanish-mlsum"
#model_Str = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_str)
model = AutoModelForSeq2SeqLM.from_pretrained(model_str)

In [17]:
articles_summaries = []
for article in articles:
    input_ids = tokenizer(article, return_tensors="pt").input_ids    
    output_ids = model.generate(input_ids, max_new_tokens=400, penalty_alpha=0.6, top_k=4,)[0]
    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    articles_summaries.append(summary)

: 

: 

In [12]:
view_string(articles_summaries[0])

['El consejo para ahorrar energía y agua al utilizar el lavarropa. La Federación Argentina de Cooperat',
 'ivas de Electricidad y Otros Servicios Públicos (FACE) debe dejar una serie de consejos para alcanza',
 'r la eficiencia energética']

In [14]:
print(len(articles_summaries)), print(df_clean.shape[0])

119
119


(None, None)

In [15]:
summaries = pl.DataFrame(articles_summaries)
df_clean = df_clean.with_columns(pl.Series("sumary", articles_summaries))  


In [16]:
df_clean.columns

['date_extract',
 'date_article',
 'topic',
 'content',
 'link',
 'titles',
 'sumaries',
 'authors',
 'portal',
 'state',
 'city',
 'content_hash',
 'content_nchar',
 'content_cleaned',
 'sumary']

In [17]:
df_clean.write_csv('df_clean.csv')

In [None]:
del tokenizer
del model
del articles_tokenized

In [21]:
import gc
gc.collect()

1440

In [6]:
df_clean['titles'].to_list()[:10]

['Ahorra energía y dinero al utilizar tu lavarropa en el momento indicado',
 'Visita a empresarios y organizaciones culturales: Federico Prieto en el departamento Colón',
 'Previaje 2023: cómo averiguar cuánto dinero queda en la tarjeta y hasta qué día se podrá usar',
 'El cartel del medido, tapado por las ramas en 47, 4 y 5',
 'Otro crimen en el conurbano conmociona al país: mataron a un hombre de 50 años en un asalto',
 'Ordenaron 18 allanamientos y detuvieron a tres acusados del crimen de Pérez Algaba',
 'En una despensa de Posadas incautaron 131 dosis de cocaína: dos detenidos con poco más de 7 millones de pesos',
 'La AFIP incautó 60 toneladas de soja valuada en más de 6 millones de pesos',
 '"El don del crimen" de Marco Lucchesi',
 'La agresión a una chica, la disputa de dos jóvenes y una ejecución en una calle de Pocito']

In [None]:
df_clean['sumary'].to_list()[:10]

In [4]:
import polars as pl
df = (
    pl.scan_csv("df_clean.csv")  # lazy, doesn't do a thing
    .select(
        ["sumary", "content_cleaned"]
    )  
)

In [8]:
df = df.collect()
df.write_csv('df_clean_contsum.csv')

Processin all with loop

In [172]:
index = list(range(1, df_final.shape[0]+1))
articles = df_final['content_cleaned'].to_list()

In [None]:
import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# Load Spanish stopwords from spacy
nlp = spacy.load("es_core_news_sm")
spanish_stopwords_spacy = spacy.lang.es.stop_words.STOP_WORDS

def remove_stopwords(text):
    """Remove Spanish stopwords from the text."""
    return " ".join([word for word in text.split() if word not in spanish_stopwords_spacy])

def langchain_chunk_text(text):
    
    # Remove stopwords
    # text = remove_stopwords(text)
    
    # Define a basic whitespace tokenizer
    def whitespace_tokenizer(text):
        return text.split()

    custom_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=60,
        length_function = len,
        #length_function=lambda x: len(whitespace_tokenizer(x))
    )
    
    # Create the chunks using langchain
    documents = custom_text_splitter.create_documents([text])
    
    # Extract the text content from the resulting Document objects
    chunks = [doc.page_content for doc in documents]
    
    return chunks


def ner_on_large_document(text):
    nlp_ner = pipeline(
        "ner",
        model="mrm8488/bert-spanish-cased-finetuned-ner",
        #model="PlanTL-GOB-ES/roberta-base-bne-capitel-ner",
        #tokenizer=('mrm8488/bert-spanish-cased-finetuned-ner', {"use_fast": False}),
        #aggregation_strategy="simple"
        aggregation_strategy="max" # more precision!
        #aggregation_strategy="average"
    )

    chunks = langchain_chunk_text(text)
    all_ner_results = []

    for chunk in chunks:
        ner_results = nlp_ner(chunk)
        all_ner_results.extend(ner_results)

    return all_ner_results

In [None]:
index = list(range(1, df_final.shape[0]+1))
articles = df_final['content_cleaned'].to_list()

In [174]:
# Initialize an empty DataFrame
df_combined = pl.DataFrame()

for article, i in zip(articles, index):
    # Assume ner_on_large_document is a function that processes the article
    ner = ner_on_large_document(article)
    df = pl.DataFrame(ner)
    df_len = df.shape[0]
    df = df.with_columns(pl.Series("index", [i] * df_len))  
    # Check if the columns of df match those of df_combined or if df_combined is still empty
    if set(df.columns) == set(df_combined.columns) or df_combined.shape[1] == 0:
        # Vertically stack the generated df to the df_combined
        df_combined = pl.concat([df_combined, df])

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.po

In [None]:
df_combined.write_csv('ner.csv')