In [1]:
import pandas as pd
import re
from datetime import date
from datetime import datetime
import locale
locale.setlocale(locale.LC_ALL, ('es_ES', 'UTF-8'))

'es_ES.UTF-8'

In [2]:
df = pd.read_csv("../../data/uala_news.csv")

# GENERAL

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link       84 non-null     object
 1   title      83 non-null     object
 2   date       83 non-null     object
 3   summary    78 non-null     object
 4   paragraph  84 non-null     object
dtypes: object(5)
memory usage: 3.4+ KB


In [4]:
df.describe(include='object')

Unnamed: 0,link,title,date,summary,paragraph
count,84,83,83,78,84
unique,84,81,82,77,84
top,https://www.uala.com.ar/prensa/uala-despegar-c...,6 de cada 10 personas piensan que para inverti...,"Buenos Aires, 9 de marzo de 2023.","La inversión es vista como positiva, pero aún ...","Buenos Aires, Argentina, 16 de septiembre de 2..."
freq,1,2,2,2,1


Calculates the number of missing (null/NaN) values in each column of the DataFrame df. It returns a Series where the index is the column name and the value is the count of nulls in that column.

In [5]:
df.isnull().sum()

link         0
title        1
date         1
summary      6
paragraph    0
dtype: int64

Counts the number of duplicate rows in the DataFrame df. It returns an integer representing how many rows are exact duplicates of previous rows. This helps you identify if your dataset has repeated entries.

In [6]:
df.duplicated().sum()

np.int64(0)

# LINKS

Check invalid links

In [7]:
df['link'].apply(lambda x: not x.startswith('http')).sum()

np.int64(0)

Check duplicated links

In [8]:
df['link'].duplicated().sum()

np.int64(0)

Remove duplicates

In [9]:
df = df.drop_duplicates(subset='link')

# DATE

Verify all dates have the following format: Month Day, Year (for example: Abr 2, 2024).

In [10]:
def verify_date_format(date):
    if pd.isna(date):  # Check for NaN/None values
        return True
    if not isinstance(date, str):  # Check if value is not a string
        return True
    return not bool(re.match(r'[a-zA-Z]+\s\d+\s.\s\d+', date))

df_bad_format = df[df['date'].apply(verify_date_format)]

df_bad_format["date"].apply(lambda x: print(x))

Buenos Aires, Argentina, 16 de septiembre de 2025.-
Buenos Aires, Argentina, 25 de agosto de 2025.-
Buenos Aires, Argentina, 17 de julio de 2025.-
Buenos Aires, Argentina, 2 de julio de 2025.-
Buenos Aires, Argentina, 24 de junio de 2025.-
Buenos Aires, Argentina, 14 de abril de 2025.-
Buenos Aires, Argentina, 10 de abril de 2025.-
Buenos Aires, Argentina, 3 de abril de 2025.-
Buenos Aires, 18 de marzo de 2025.-
Buenos Aires, 26 de febrero de 2025.-
Buenos Aires, 6 de febrero de 2025.-
Buenos Aires, 28 de enero de 2025.-
nan
Buenos Aires, 4 de diciembre de 2024.-
Buenos Aires, Argentina y Múnich, Alemania, 11 de noviembre de 2024
Buenos Aires, 23 de octubre de 2024.
Buenos Aires, 24 de septiembre de 2024.
Buenos Aires, 19 de septiembre de 2024.
Buenos Aires, 18 de septiembre de 2024.
Buenos Aires, 6 de septiembre de 2024.-
Buenos Aires, 2 de septiembre de 2024.-
Buenos Aires, 28 de agosto de 2024.-
Buenos Aires, 13 de agosto de 2024.-
Buenos Aires, 16 de julio de 2024.-
Buenos Aires, 1

0     None
1     None
2     None
3     None
4     None
      ... 
79    None
80    None
81    None
82    None
83    None
Name: date, Length: 84, dtype: object

Some dates where extracted with the text. The idea here is to extract only the date that is in the following format: Month Day, Year (for example: Abr 2, 2024) and change it to a date format.

In [11]:
#Clean dates
temp_dates = []
i = 0

df['paragraph'][i]

for date in df['date']:
    if pd.isna(date):
        new_date = ''
        temp_dates.append(new_date)
        i += 1
        continue
    if date == 'PIP':
        date = df['paragraph'][i]
    date = date.replace('°',"")
    new_date = re.findall(r'\d{1,2}\s\w*\s\w*\w\s\w*\s\d*|\d{1,2}\s\w*\s\w*\w\s\d*', date)[0]
    try:
        new_date = datetime.strptime(new_date, '%d de %B de %Y')
    except ValueError:
        try:
            new_date = datetime.strptime(new_date, '%d de %B %Y')
        except ValueError:
            new_date = datetime.strptime(new_date, '%d de %B del %Y')
    temp_dates.append(new_date)
    i += 1

df['new_dates'] = temp_dates

Just to verify

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link       84 non-null     object
 1   title      83 non-null     object
 2   date       83 non-null     object
 3   summary    78 non-null     object
 4   paragraph  84 non-null     object
 5   new_dates  84 non-null     object
dtypes: object(6)
memory usage: 4.1+ KB


# TEXTS

In [13]:
paragraphs_na = df[df["paragraph"].isna()]["link"]
print(paragraphs_na)

Series([], Name: link, dtype: object)


In [14]:
df.describe()

Unnamed: 0,link,title,date,summary,paragraph,new_dates
count,84,83,83,78,84,84
unique,84,81,82,77,84,77
top,https://www.uala.com.ar/prensa/uala-despegar-c...,6 de cada 10 personas piensan que para inverti...,"Buenos Aires, 9 de marzo de 2023.","La inversión es vista como positiva, pero aún ...","Buenos Aires, Argentina, 16 de septiembre de 2...",2025-02-26 00:00:00
freq,1,2,2,2,1,2


In [15]:
df['title_len'] = df['title'].str.len()
df['summary_len'] = df['summary'].str.len()
df['paragraph_len'] = df['paragraph'].str.len()

df[['title_len', 'summary_len', 'paragraph_len']].describe()

Unnamed: 0,title_len,summary_len,paragraph_len
count,83.0,78.0,84.0
mean,71.072289,209.858974,408.047619
std,24.450439,92.65839,120.51455
min,31.0,68.0,176.0
25%,56.0,133.75,315.5
50%,69.0,197.0,422.0
75%,86.5,265.0,482.0
max,137.0,492.0,656.0


Most of the paragraphs start with the city (or cities) and the date from which the article is about. For example: "Ciudad de México, 2 de julio de 2025". 

In [16]:
df['paragraph'].head()

0    Buenos Aires, Argentina, 16 de septiembre de 2...
1    Buenos Aires, Argentina, 25 de agosto de 2025....
2    Buenos Aires, Argentina, 17 de julio de 2025.-...
3    Buenos Aires, Argentina, 2 de julio de 2025.- ...
4    Buenos Aires, Argentina, 24 de junio de 2025.-...
Name: paragraph, dtype: object

Here I look for any posibilities to clean the paragraphs from these dates + cities combinations. I had to include different combinations using REGEX.

In [17]:
#Casos: 287, 276

def clean_paragraph_format(text):
    text = text.replace('°',"").strip()
    text = text.replace('Buenos Aires, 30 de abril 2024', "Buenos Aires, 30 de abril de 2024").strip()    
    text = re.sub(r'(Ciudad de México a)|(Ciudad de México, a)', 'Ciudad de México,', text)    
    match = re.search(r'^.*,\s*(\w*\sde\s\w*\s|\w*\s|\w*\s*\w*|\w*\s\d*\s)(de|del|,)\s\d*(\.|\s*[-:–—]|\s)', text)
    if match:
        text = text[match.end():].strip()
        text = re.sub(r'(-+)|(—)', '', text)
    return text

In [18]:
new_paragraphs = []
for paragraph in df['paragraph']:
    new_paragraphs.append(clean_paragraph_format(paragraph))    

df['new_paragraph'] = new_paragraphs

In [19]:
df['new_paragraph_len'] = df['new_paragraph'].str.len()
df[['new_paragraph_len']].describe()

Unnamed: 0,new_paragraph_len
count,84.0
mean,381.97619
std,119.13392
min,176.0
25%,283.75
50%,385.5
75%,447.5
max,619.0


In [20]:
df['new_summary'] = df['summary'].fillna('')
df['new_paragraph'] = df['new_paragraph'].fillna('')

In [22]:
df['new_summary_paragraph'] = df.apply(
    lambda row: row['new_summary'].lower() in row['new_paragraph'].lower() if row['new_summary'] != '' else False, axis=1)

df['new_summary_paragraph'].mean()

np.float64(0.09523809523809523)

In [None]:
df['new_summary'] = df.apply(
    lambda row: '' if row['new_summary_paragraph'] == True else row['new_summary'], axis=1)