# NER full processing

In [106]:
import polars as pl
import re 
import time
def view_string(long_string, chunk_size=100):     
    return [long_string[i:i+chunk_size] for i in range(0, len(long_string), chunk_size)]

In [107]:
df = pl.read_csv('/home/sebacastillo/willow/output/news_narcotráfico_related_2023-08-12_1735.csv')

In [108]:
null_counts = {col: df.filter(df[col].is_null()).shape[0] for col in df.columns}
print(null_counts)

{'date_extract': 0, 'date_article': 0, 'topic': 0, 'content': 0, 'link': 0, 'authors': 0, 'portal': 0, 'state': 0, 'city': 0}


In [109]:
# Process the dataframe and create the content_hash column
df_processed = (
    df.with_columns([
        pl.col('date_extract').str.strptime(pl.Date, format='%Y-%m-%d %H:%M:%s', strict=True),
        pl.col('date_article').str.slice(0, 10).str.strptime(pl.Date, format='%Y-%m-%d'),
        pl.col("content").str.n_chars().alias("content_nchar"),
        pl.col("content").hash().alias("content_hash"),
        # Add other columns/transformations as required
          # Replace new lines and tabs with a single space, then replace multiple spaces with a single space
        pl.col("content").str.replace_all(r"[\n\t]+", " ")
        .str.replace_all(r"\s{2,}", " ")
        .str.strip().str.strip().alias("content_cleaned"),  # Adjusted
    ])
)

# Group by the content_hash to find duplicates
df_duplicated = df_processed.groupby("content_hash").agg(pl.col("content_hash").count().alias("count"))

# Join this information back to the original dataframe and add the duplicated_content column
df_final = (
    df_processed
    .join(df_duplicated, on="content_hash")
    .with_columns([
        (pl.col('count') > 1).alias('duplicated_content')
    ])
)

In [110]:
df_final = df_final.filter(pl.col("duplicated_content") == False)

In [111]:
df_final.head()

date_extract,date_article,topic,content,link,authors,portal,state,city,content_nchar,content_hash,content_cleaned,count,duplicated_content
date,date,str,str,str,str,str,str,str,u32,u64,str,u32,bool
2023-08-12,2023-08-11,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",8715,7516608243946126849,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",7451,14334825721934162832,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",3157,18231986224012950007,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",3132,10929798021160889508,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",8401,16553782415713163754,"""Estimado lecto…",1,False


In [112]:
view_string(df_final['content_cleaned'][0])

['Estimado lector, muchas gracias por su interés en nuestras notas. Hemos incorporado el registro con ',
 'el objetivo de mejorar la información que le brindamos de acuerdo a sus intereses. Para más informac',
 'ión haga clic aquí Suplementos > Avisos > Servicios > Entretenimientos > Otros sitios > 11 °C La Pla',
 'ta Sabado 12 de Agosto, 2023 Mariano Pérez de Eulate mpeulate@eldia.com Todo el mundillo político se',
 ' preguntaba ayer si el crimen de la pequeña Morena Domínguez está destinado a convertirse en uno de ',
 'esos hitos revulsivos que pueden cambiar el curso de la historia o influir de un modo decisivo en el',
 'la. La atrocidad de lo ocurrido en Lanús se nacionalizó e indefectiblemente invadió la campaña elect',
 'oral de cara a las Primarias Abiertas del domingo. La mesura fue la reacción obligada de la política',
 ', bajando actos públicos previstos para ayer. En verdad, no había otra opción para los candidatos má',
 's que ese gesto de respeto. Salvo que prefirieran que

In [116]:
import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# Load Spanish stopwords from spacy
nlp = spacy.load("es_core_news_sm")
spanish_stopwords_spacy = spacy.lang.es.stop_words.STOP_WORDS

def remove_stopwords(text):
    """Remove Spanish stopwords from the text."""
    return " ".join([word for word in text.split() if word not in spanish_stopwords_spacy])

def langchain_chunk_text(text):
    
    # Remove stopwords
    # text = remove_stopwords(text)
    
    # Define a basic whitespace tokenizer
    def whitespace_tokenizer(text):
        return text.split()

    custom_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=60,
        length_function = len,
        #length_function=lambda x: len(whitespace_tokenizer(x))
    )
    
    # Create the chunks using langchain
    documents = custom_text_splitter.create_documents([text])
    
    # Extract the text content from the resulting Document objects
    chunks = [doc.page_content for doc in documents]
    
    return chunks


def ner_on_large_document(text):
    nlp_ner = pipeline(
        "ner",
        model="mrm8488/bert-spanish-cased-finetuned-ner",
        #model="PlanTL-GOB-ES/roberta-base-bne-capitel-ner",
        #tokenizer=('mrm8488/bert-spanish-cased-finetuned-ner', {"use_fast": False}),
        #aggregation_strategy="simple"
        aggregation_strategy="max" # more precision!
        #aggregation_strategy="average"
    )

    chunks = langchain_chunk_text(text)
    all_ner_results = []

    for chunk in chunks:
        ner_results = nlp_ner(chunk)
        all_ner_results.extend(ner_results)

    return all_ner_results

In [142]:
df_small = df_final[0:3]
df_small

date_extract,date_article,topic,content,link,authors,portal,state,city,content_nchar,content_hash,content_cleaned,count,duplicated_content
date,date,str,str,str,str,str,str,str,u32,u64,str,u32,bool
2023-08-12,2023-08-11,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",8715,7516608243946126849,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",7451,14334825721934162832,"""Estimado lecto…",1,False
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",3157,18231986224012950007,"""Estimado lecto…",1,False


In [170]:
index = list(range(1, df_small.shape[0]+1))
index

[1, 2, 3]

In [147]:
df_small = df_small.with_columns(pl.Series('index', index))

In [148]:
df_small

date_extract,date_article,topic,content,link,authors,portal,state,city,content_nchar,content_hash,content_cleaned,count,duplicated_content,index
date,date,str,str,str,str,str,str,str,u32,u64,str,u32,bool,i64
2023-08-12,2023-08-11,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",8715,7516608243946126849,"""Estimado lecto…",1,False,1
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",7451,14334825721934162832,"""Estimado lecto…",1,False,2
2023-08-12,2023-08-12,"""narcotráfico""","""Estimado lecto…","""https://www.el…","""Diario El Dia …","""https://www.lm…","""Buenos Aires""","""La Plata""",3157,18231986224012950007,"""Estimado lecto…",1,False,3


In [158]:
print(articles), print(index)

['Estimado lector, muchas gracias por su interés en nuestras notas. Hemos incorporado el registro con el objetivo de mejorar la información que le brindamos de acuerdo a sus intereses. Para más información haga clic aquí Suplementos > Avisos > Servicios > Entretenimientos > Otros sitios > 11 °C La Plata Sabado 12 de Agosto, 2023 Mariano Pérez de Eulate mpeulate@eldia.com Todo el mundillo político se preguntaba ayer si el crimen de la pequeña Morena Domínguez está destinado a convertirse en uno de esos hitos revulsivos que pueden cambiar el curso de la historia o influir de un modo decisivo en ella. La atrocidad de lo ocurrido en Lanús se nacionalizó e indefectiblemente invadió la campaña electoral de cara a las Primarias Abiertas del domingo. La mesura fue la reacción obligada de la política, bajando actos públicos previstos para ayer. En verdad, no había otra opción para los candidatos más que ese gesto de respeto. Salvo que prefirieran quedar en el centro de una oleada de descrédito.

(None, None)

In [165]:
articles = df_small['content_cleaned'].to_list()

for article, i in zip(articles, index):
    ner = ner_on_large_document(article)
    df = pl.DataFrame(ner)
    df_len = df.shape[0]
    df = df.with_columns(pl.Series("index", [i] * df_len))    
    print(df.head())
    df_combined = pl.vstack([df, df])    

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


shape: (5, 6)
┌──────────────┬──────────┬─────────────────────────┬───────┬─────┬───────┐
│ entity_group ┆ score    ┆ word                    ┆ start ┆ end ┆ index │
│ ---          ┆ ---      ┆ ---                     ┆ ---   ┆ --- ┆ ---   │
│ str          ┆ f64      ┆ str                     ┆ i64   ┆ i64 ┆ i64   │
╞══════════════╪══════════╪═════════════════════════╪═══════╪═════╪═══════╡
│ LOC          ┆ 0.999281 ┆ La Plata                ┆ 294   ┆ 302 ┆ 1     │
│ PER          ┆ 0.998932 ┆ Mariano Pérez de Eulate ┆ 329   ┆ 352 ┆ 1     │
│ MISC         ┆ 0.61056  ┆ eldia                   ┆ 362   ┆ 367 ┆ 1     │
│ MISC         ┆ 0.86165  ┆ Eulate mpeulate         ┆ 3     ┆ 18  ┆ 1     │
│ MISC         ┆ 0.76197  ┆ eldia                   ┆ 19    ┆ 24  ┆ 1     │
└──────────────┴──────────┴─────────────────────────┴───────┴─────┴───────┘


Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


shape: (5, 6)
┌──────────────┬──────────┬──────────────────┬───────┬─────┬───────┐
│ entity_group ┆ score    ┆ word             ┆ start ┆ end ┆ index │
│ ---          ┆ ---      ┆ ---              ┆ ---   ┆ --- ┆ ---   │
│ str          ┆ f64      ┆ str              ┆ i64   ┆ i64 ┆ i64   │
╞══════════════╪══════════╪══════════════════╪═══════╪═════╪═══════╡
│ LOC          ┆ 0.999264 ┆ La Plata         ┆ 294   ┆ 302 ┆ 2     │
│ PER          ┆ 0.999134 ┆ Juan Carlos Cruz ┆ 67    ┆ 83  ┆ 2     │
│ LOC          ┆ 0.999522 ┆ Morón            ┆ 216   ┆ 221 ┆ 2     │
│ ORG          ┆ 0.999753 ┆ Policía          ┆ 305   ┆ 312 ┆ 2     │
│ ORG          ┆ 0.999483 ┆ Télam            ┆ 386   ┆ 391 ┆ 2     │
└──────────────┴──────────┴──────────────────┴───────┴─────┴───────┘


Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


shape: (5, 6)
┌──────────────┬──────────┬───────────────────────────┬───────┬─────┬───────┐
│ entity_group ┆ score    ┆ word                      ┆ start ┆ end ┆ index │
│ ---          ┆ ---      ┆ ---                       ┆ ---   ┆ --- ┆ ---   │
│ str          ┆ f64      ┆ str                       ┆ i64   ┆ i64 ┆ i64   │
╞══════════════╪══════════╪═══════════════════════════╪═══════╪═════╪═══════╡
│ LOC          ┆ 0.999304 ┆ La Plata                  ┆ 294   ┆ 302 ┆ 3     │
│ PER          ┆ 0.996415 ┆ Francis Castillo Yoe Caro ┆ 374   ┆ 399 ┆ 3     │
│ PER          ┆ 0.999175 ┆ Francis Castillo Yoe Caro ┆ 33    ┆ 58  ┆ 3     │
│ ORG          ┆ 0.99974  ┆ Policía                   ┆ 118   ┆ 125 ┆ 3     │
│ LOC          ┆ 0.989769 ┆ Cabezas                   ┆ 213   ┆ 220 ┆ 3     │
└──────────────┴──────────┴───────────────────────────┴───────┴─────┴───────┘


In [168]:
# Initialize an empty DataFrame
df_combined = pl.DataFrame()

for article, i in zip(articles, index):
    # Assume ner_on_large_document is a function that processes the article
    ner = ner_on_large_document(article)
    df = pl.DataFrame(ner)
    df_len = df.shape[0]
    df = df.with_columns(pl.Series("index", [i] * df_len))   
    # Vertically stack the generated df to the df_combined
    df_combined = pl.concat([df_combined, df])

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.po

In [169]:
df_combined.head()

entity_group,score,word,start,end,index
str,f64,str,i64,i64,i64
"""LOC""",0.999281,"""La Plata""",294,302,1
"""PER""",0.998932,"""Mariano Pérez …",329,352,1
"""MISC""",0.61056,"""eldia""",362,367,1
"""MISC""",0.86165,"""Eulate mpeulat…",3,18,1
"""MISC""",0.76197,"""eldia""",19,24,1


Processin all with loop

In [172]:
index = list(range(1, df_final.shape[0]+1))
articles = df_final['content_cleaned'].to_list()

In [174]:
# Initialize an empty DataFrame
df_combined = pl.DataFrame()

for article, i in zip(articles, index):
    # Assume ner_on_large_document is a function that processes the article
    ner = ner_on_large_document(article)
    df = pl.DataFrame(ner)
    df_len = df.shape[0]
    df = df.with_columns(pl.Series("index", [i] * df_len))  
    # Check if the columns of df match those of df_combined or if df_combined is still empty
    if set(df.columns) == set(df_combined.columns) or df_combined.shape[1] == 0:
        # Vertically stack the generated df to the df_combined
        df_combined = pl.concat([df_combined, df])

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.po

In [None]:
df_combined.write_csv('ner.csv')