In [1]:
import pandas as pd
import re
from datetime import date
from datetime import datetime
import locale
# Switch every locale category to Spanish (Spain, UTF-8). setlocale returns the
# locale string it set, which is the value this cell displays.
# NOTE(review): presumably needed so Spanish month names can be parsed later — confirm.
# setlocale raises locale.Error when es_ES.UTF-8 is not installed on the machine.
locale.setlocale(locale.LC_ALL, ('es_ES', 'UTF-8'))

'es_ES.UTF-8'

In [2]:
# Load the Santander news dataset. The path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
df = pd.read_csv("../../data/santander_news.csv")

# GENERAL

In [3]:
# Column names, dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link       175 non-null    object
 1   title      175 non-null    object
 2   date       175 non-null    object
 3   summary    175 non-null    object
 4   paragraph  175 non-null    object
dtypes: object(5)
memory usage: 7.0+ KB


In [4]:
# Per-column count, number of unique values, most frequent value and its frequency
df.describe(include='object')

Unnamed: 0,link,title,date,summary,paragraph
count,175,175,175,175,175
unique,175,175,157,175,175
top,https://www.santander.com/es/sala-de-comunicac...,Banco Santander Colombia realizó con éxito su ...,2020-04-28,Banco Santander anunció que completó con éxito...,"Bogotá, 16 de octubre de 2025. Banco Santander..."
freq,1,1,3,1,1


Calculates the number of missing (null/NaN) values in each column of the DataFrame df. It returns a Series where the index is the column name and the value is the count of nulls in that column.

In [5]:
# Number of missing values per column
df.isnull().sum()

link         0
title        0
date         0
summary      0
paragraph    0
dtype: int64

Counts the number of duplicate rows in the DataFrame df. It returns an integer representing how many rows are exact duplicates of previous rows. This helps you identify if your dataset has repeated entries.

In [6]:
# Number of rows that exactly repeat an earlier row (all columns compared)
df.duplicated().sum()

np.int64(0)

# LINKS

Check invalid links

In [7]:
# Count links that do not begin with "http"
has_http_prefix = df['link'].str.startswith('http')
(~has_http_prefix).sum()

np.int64(0)

Check duplicated links

In [8]:
# Number of links that already appeared in an earlier row
df['link'].duplicated().sum()

np.int64(0)

Remove duplicates

In [9]:
# Keep the first row for each link. ignore_index=True renumbers the rows 0..n-1,
# so the index stays contiguous even when rows are dropped.
df = df.drop_duplicates(subset='link', ignore_index=True)

# TEXTS

In [10]:
# Links of the articles whose paragraph is missing (none in this dataset)
missing_paragraph_mask = df["paragraph"].isna()
paragraphs_na = df.loc[missing_paragraph_mask, "link"]
print(paragraphs_na)

Series([], Name: link, dtype: object)


In [11]:
# All columns are still object dtype here, so describe() reports
# count / unique / top / freq for each of them.
df.describe()

Unnamed: 0,link,title,date,summary,paragraph
count,175,175,175,175,175
unique,175,175,157,175,175
top,https://www.santander.com/es/sala-de-comunicac...,Banco Santander Colombia realizó con éxito su ...,2020-04-28,Banco Santander anunció que completó con éxito...,"Bogotá, 16 de octubre de 2025. Banco Santander..."
freq,1,1,3,1,1


In [12]:
# Character length of every text field, stored as "<column>_len"
text_columns = ['title', 'summary', 'paragraph']
for column in text_columns:
    df[f'{column}_len'] = df[column].str.len()

# Distribution of the three length columns
df[[f'{column}_len' for column in text_columns]].describe()

Unnamed: 0,title_len,summary_len,paragraph_len
count,175.0,175.0,175.0
mean,99.457143,248.04,339.16
std,27.525895,100.980956,168.102861
min,33.0,72.0,59.0
25%,80.5,162.0,224.0
50%,100.0,241.0,318.0
75%,117.0,320.5,416.5
max,180.0,550.0,888.0


Most paragraphs start with a dateline: the city (or cities) and the date the article refers to. For example: "Ciudad de México, 2 de julio de 2025".

In [13]:
# Peek at the first paragraphs to see how they open
df['paragraph'].head()

0    Bogotá, 16 de octubre de 2025. Banco Santander...
1    Ciudad de México, 16 de octubre de 2025. Grupo...
2    Banco Santander, en colaboración con la plataf...
3    Ciudad de México, 7 de octubre de 2025. El rec...
4    Ciudad de México, 2 de octubre de 2025. Con es...
Name: paragraph, dtype: object

Here I look for ways to strip these city + date combinations from the start of each paragraph. Because the datelines come in several formats, the cleaning uses several regular-expression (regex) patterns.

In [25]:
def clean_paragraph_format(text):
    """Strip press-release labels and the leading dateline from a paragraph.

    Many paragraphs open with a dateline such as
    "Ciudad de México, 2 de julio de 2025." This function removes a few fixed
    labels, then cuts everything up to the end of the first dateline it
    recognises and returns the remaining text.

    Args:
        text: Raw paragraph. Non-string values (``None``, ``NaN``) are returned
            unchanged so that a later ``fillna`` can deal with them.

    Returns:
        The cleaned paragraph with surrounding whitespace removed, or the
        input itself when it is not a string.
    """
    # Missing values arrive as None / float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return text

    # Fixed labels. NOTE(review): ' - ' is removed everywhere in the text, which
    # joins the two words around it — confirm this is intended.
    text = re.sub(r'(NOTA DE PRENSA)|(– PRESS RELEASE)|( - )', '', text)
    # Hard-coded datelines handled explicitly (presumably not caught by the
    # generic patterns below — confirm).
    text = re.sub(r'(^Montevideo, Uruguay,)|(São Paulo, 24 March 2020)', '', text)
    text = text.replace('°', "").strip()

    # Pattern 1: "<place(s)>, <date>". `^.*,` is greedy, so it anchors on the LAST
    # comma of the first line that is followed by a date-like sequence; this is
    # what lets "City, Country, 2 de ..." be consumed as a whole.
    # NOTE(review): for the same reason it can over-match when a later comma in
    # the first line happens to be followed by "<word> de <number>".
    place_and_date = r'^.*,\s*(\w*\sde\s\w*\s|\w*\s|\w*\s*\w*|\w*\s\d*\s|\d*\s)(de|del|,)\s(\d*|\w*\s\d*)(\.|\s*[-:–—]|\s)'
    # Pattern 2: a date at the very start with no place before it.
    date_only = r'^(\w*\s\d*\sde\s\w*\sde\s\d*\.|\d*\sde\s\w*\sde\s\d*\.|\d*\s\w*\s\d*\.|\w*\s\d*\sde\s\w*\,\s\d*\.|\d*\sde\s\w*\s\d*\.)'

    match = re.search(place_and_date, text)
    if match is None:
        match = re.search(date_only, text)
    if match is None:
        # No dateline recognised: keep the text as it is.
        return text

    text = text[match.end():].strip()
    # Remove hyphens and em dashes left after the dateline.
    # NOTE(review): this also removes hyphens inside words (e.g. "COVID-19").
    text = re.sub(r'(-+)|(—)', '', text)
    # Removing a leading dash can expose whitespace, so strip once more.
    return text.strip()

In [26]:
# Clean every paragraph and store the result next to the original text
new_paragraphs = [clean_paragraph_format(raw_text) for raw_text in df['paragraph']]
df['new_paragraph'] = new_paragraphs

In [27]:
# Length of each paragraph after cleaning, and its distribution
cleaned_lengths = df['new_paragraph'].str.len()
df['new_paragraph_len'] = cleaned_lengths
df[['new_paragraph_len']].describe()

Unnamed: 0,new_paragraph_len
count,175.0
mean,319.965714
std,159.110996
min,59.0
25%,207.5
50%,300.0
75%,390.0
max,847.0


In [28]:
# Replace missing text with empty strings so later string operations never see NaN
for target_col, source_col in (('new_summary', 'summary'), ('new_paragraph', 'new_paragraph')):
    df[target_col] = df[source_col].fillna('')

In [29]:
def _summary_in_paragraph(row):
    """Return True when the row's non-empty summary occurs (case-insensitively) in its cleaned paragraph."""
    summary = row['new_summary']
    if summary == '':
        return False
    return summary.lower() in row['new_paragraph'].lower()


df['new_summary_paragraph'] = df.apply(_summary_in_paragraph, axis=1)

# Share of articles whose summary is already contained in the paragraph
df['new_summary_paragraph'].mean()

np.float64(0.38285714285714284)

In [30]:
# Blank out the summary wherever it is already contained in the cleaned paragraph
# (presumably so the same text is not kept twice — confirm). Series.mask replaces
# the values where the boolean flag is True and leaves the others untouched.
df['new_summary'] = df['new_summary'].mask(df['new_summary_paragraph'], '')