# NER full processing

In [1]:
import polars as pl
import re 
import time
def view_string(long_string, chunk_size=100):
    """Break a long string into consecutive fixed-width pieces for easier viewing.

    Args:
        long_string: The text to split.
        chunk_size: Maximum number of characters per piece (default 100).

    Returns:
        A list of substrings in original order. Every piece has ``chunk_size``
        characters except possibly the last one, which holds the remainder.
        An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(long_string), chunk_size):
        pieces.append(long_string[start:start + chunk_size])
    return pieces

In [2]:
# Input CSV of news articles; this is an absolute path on the author's machine.
NEWS_CSV_PATH = '/home/sebacastillo/willow/output/news_narcotráfico_related_2023-08-12_1735.csv'
df = pl.read_csv(NEWS_CSV_PATH)

In [3]:
# Count missing values per column as a quick data-quality check.
null_counts = {name: df[name].null_count() for name in df.columns}
print(null_counts)

{'date_extract': 0, 'date_article': 0, 'topic': 0, 'content': 0, 'link': 0, 'authors': 0, 'portal': 0, 'state': 0, 'city': 0}


In [4]:
# Parse the date columns, derive helper columns (length, hash, cleaned text)
# and flag articles whose raw content appears more than once.
df_processed = (
    df.with_columns([
        # "%S" is the seconds-of-minute field. The previous "%s" is the
        # Unix-timestamp specifier in the strftime syntax polars uses.
        pl.col('date_extract').str.strptime(pl.Date, format='%Y-%m-%d %H:%M:%S', strict=True),
        # Keep only the leading 10 characters ("YYYY-MM-DD") before parsing.
        pl.col('date_article').str.slice(0, 10).str.strptime(pl.Date, format='%Y-%m-%d'),
        pl.col("content").str.n_chars().alias("content_nchar"),
        # Hash of the raw (uncleaned) text; used below as the duplicate key.
        pl.col("content").hash().alias("content_hash"),
        # Replace runs of newlines/tabs with one space, collapse repeated
        # whitespace, then trim both ends. A single strip is sufficient; the
        # second call that used to follow it was a no-op.
        pl.col("content")
        .str.replace_all(r"[\n\t]+", " ")
        .str.replace_all(r"\s{2,}", " ")
        .str.strip()
        .alias("content_cleaned"),
    ])
)

# Number of rows sharing each content hash.
df_duplicated = df_processed.groupby("content_hash").agg(pl.col("content_hash").count().alias("count"))

# Attach the per-hash count to every row and flag rows whose content is repeated.
df_final = (
    df_processed
    .join(df_duplicated, on="content_hash")
    .with_columns([
        (pl.col('count') > 1).alias('duplicated_content')
    ])
)

In [5]:
# Keep only rows not flagged as duplicated. The flag is set on every row whose
# content hash occurs more than once, so no copy of a repeated article survives.
df_final = df_final.filter(~pl.col("duplicated_content"))

In [6]:
# (rows, columns) of the frame after the duplicate filter.
df_final.shape

(123, 14)

In [7]:
# Plain Python list of the cleaned article texts, in frame order.
articles = df_final.get_column('content_cleaned').to_list()

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Checkpoint identifier passed to from_pretrained for the summarization model
# (the name suggests mT5 fine-tuned on Spanish MLSUM — TODO confirm).
model_str = "IIC/mt5-spanish-mlsum"
# Tokenizer and model are both loaded from the same checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(model_str)
model = AutoModelForSeq2SeqLM.from_pretrained(model_str)

In [9]:
# Decoding settings shared by every generate() call. In Hugging Face
# transformers, penalty_alpha combined with top_k selects contrastive search.
generation_kwargs = dict(max_new_tokens=200, penalty_alpha=0.6, top_k=4)

articles_summaries, articles_tokenized = [], []
for article in articles:
    # Encode one article at a time as a PyTorch tensor (no truncation is applied).
    input_ids = tokenizer(article, return_tensors="pt").input_ids
    articles_tokenized.append(input_ids)
    # One article is passed per call, so take the first generated sequence.
    output_ids = model.generate(input_ids, **generation_kwargs)[0]
    articles_summaries.append(tokenizer.decode(output_ids, skip_special_tokens=True))

In [None]:
# Display the generated summaries.
articles_summaries

In [11]:
# Wrap the generated summaries in a DataFrame.
summaries = pl.DataFrame(articles_summaries)
# NOTE(review): articles_tokenized holds the tokenizer's "pt" tensors rather than
# plain lists — confirm polars builds the frame you expect from them.
tokenz = pl.DataFrame(articles_tokenized)


In [15]:
# Compare the number of summaries with the frame's shape. Separate statements
# avoid the stray `(None, None)` cell output that a tuple of print() calls yields.
print(len(summaries))
print(df_final.shape)

123
(123, 14)


(None, None)

In [17]:
# Add the generated summaries as a new column; the column name "sumary" is kept as-is.
df_final = df_final.with_columns(pl.Series(name="sumary", values=articles_summaries))
   

In [18]:
# Number of tokenized articles collected by the summarization loop.
len(articles_tokenized)

123

In [19]:
# Add the tokenizer outputs as a new column. The values are the "pt" tensors
# collected during summarization, stored as-is.
df_final = df_final.with_columns(pl.Series(name="tokenz", values=articles_tokenized))

In [20]:
# List the column names of the enriched frame.
df_final.columns

['date_extract',
 'date_article',
 'topic',
 'content',
 'link',
 'authors',
 'portal',
 'state',
 'city',
 'content_nchar',
 'content_hash',
 'content_cleaned',
 'count',
 'duplicated_content',
 'sumary',
 'tokenz']

In [24]:
# Persist the cleaned, summarized articles. The "tokenz" column holds raw
# tokenizer tensors rather than a plain column type, so it is left out of the
# file. This replaces the commented-out in-place drop: df_final is not mutated,
# and pl.exclude does not fail if the column is already absent.
df_final.select(pl.exclude('tokenz')).write_parquet('news_clean.parquet')

In [25]:
# CSV is a flat text format, so the tensor-valued "tokenz" column is excluded
# from the export; df_final itself is left unchanged.
df_final.select(pl.exclude('tokenz')).write_csv('news_clean.csv')