In [1]:
import pandas as pd
import re
from datetime import date
from datetime import datetime
import locale
locale.setlocale(locale.LC_ALL, ('es_ES', 'UTF-8'))

'es_ES.UTF-8'

In [2]:
df = pd.read_csv("../../data/uala_news.csv")

# GENERAL

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link       84 non-null     object
 1   title      83 non-null     object
 2   date       78 non-null     object
 3   summary    71 non-null     object
 4   paragraph  84 non-null     object
dtypes: object(5)
memory usage: 3.4+ KB


In [5]:
df.describe(include='object')

Unnamed: 0,link,title,date,summary,paragraph
count,84,83,78,71,84
unique,84,81,77,70,84
top,https://www.uala.com.ar/prensa/uala-despegar-c...,6 de cada 10 personas piensan que para inverti...,"Buenos Aires, 9 de marzo de 2023.","La inversión es vista como positiva, pero aún ...","Buenos Aires, Argentina, 16 de septiembre de 2..."
freq,1,2,2,2,1


Calculates the number of missing (null/NaN) values in each column of the DataFrame df. It returns a Series where the index is the column name and the value is the count of nulls in that column.

In [6]:
df.isnull().sum()

link          0
title         1
date          6
summary      13
paragraph     0
dtype: int64

Counts the number of duplicate rows in the DataFrame df. It returns an integer representing how many rows are exact duplicates of previous rows. This helps you identify if your dataset has repeated entries.

In [7]:
df.duplicated().sum()

np.int64(0)

# LINKS

Check invalid links

In [8]:
df['link'].apply(lambda x: not x.startswith('http')).sum()

np.int64(0)

Check duplicated links

In [9]:
df['link'].duplicated().sum()

np.int64(0)

Remove duplicates

In [10]:
df = df.drop_duplicates(subset='link')

# DATE

Verify all dates have the following format: Month Day, Year (for example: Abr 2, 2024).

In [None]:
def verify_date_format(date):
    if pd.isna(date):  # Check for NaN/None values
        return True
    if not isinstance(date, str):  # Check if value is not a string
        return True
    return not bool(re.match(r'[a-zA-Z]+\s\d+\s.\s\d+', date))

df_bad_format = df[df['date'].apply(verify_date_format)]

df_bad_format["date"].apply(lambda x: print(x))

Buenos Aires, Argentina, 16 de septiembre de 2025.-
Buenos Aires, Argentina, 25 de agosto de 2025.-
Buenos Aires, Argentina, 17 de julio de 2025.-
Buenos Aires, Argentina, 2 de julio de 2025.-
Buenos Aires, Argentina, 24 de junio de 2025.-
Buenos Aires, Argentina, 14 de abril de 2025.-
Buenos Aires, Argentina, 10 de abril de 2025.-
Buenos Aires, Argentina, 3 de abril de 2025.-
Buenos Aires, 18 de marzo de 2025.-
Buenos Aires, 26 de febrero de 2025.-
Buenos Aires, 6 de febrero de 2025.-
Buenos Aires, 28 de enero de 2025.-
nan
Buenos Aires, 4 de diciembre de 2024.-
Buenos Aires, Argentina y Múnich, Alemania, 11 de noviembre de 2024
Buenos Aires, 23 de octubre de 2024.
Buenos Aires, 24 de septiembre de 2024.
Buenos Aires, 19 de septiembre de 2024.
Buenos Aires, 18 de septiembre de 2024.
Buenos Aires, 6 de septiembre de 2024.-
Buenos Aires, 2 de septiembre de 2024.-
Buenos Aires, 28 de agosto de 2024.-
Buenos Aires, 13 de agosto de 2024.-
Buenos Aires, 16 de julio de 2024.-
Buenos Aires, 1

0     None
1     None
2     None
3     None
4     None
      ... 
79    None
80    None
81    None
82    None
83    None
Name: date, Length: 84, dtype: object

Some dates where extracted with the text. The idea here is to extract only the date that is in the following format: Month Day, Year (for example: Abr 2, 2024) and change it to a date format.

In [11]:
#Clean dates
temp_dates = []

for date in df['date']:
    new_date = re.findall(r'[a-zA-Z]+\s[0-9]+\s.\s[0-9]+', date)[0] 
    new_date = re.sub(r'([a-zA-Z]+)', r'\1.', new_date)
    new_date = datetime.strptime(new_date, '%b %d , %Y')    
    temp_dates.append(new_date)

df['new_dates'] = temp_dates

Just to verify

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   link       306 non-null    object        
 1   title      306 non-null    object        
 2   date       306 non-null    object        
 3   summary    306 non-null    object        
 4   paragraph  306 non-null    object        
 5   new_dates  306 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 14.5+ KB


# TEXTS

In [13]:
paragraphs_na = df[df["paragraph"].isna()]["link"]
print(paragraphs_na)

Series([], Name: link, dtype: object)


In [14]:
df.describe()

Unnamed: 0,new_dates
count,306
mean,2024-04-04 12:37:38.823529472
min,2022-05-16 00:00:00
25%,2023-09-21 00:00:00
50%,2024-04-16 12:00:00
75%,2024-11-11 18:00:00
max,2025-08-14 00:00:00


In [15]:
df['title_len'] = df['title'].str.len()
df['summary_len'] = df['summary'].str.len()
df['paragraph_len'] = df['paragraph'].str.len()

df[['title_len', 'summary_len', 'paragraph_len']].describe()

Unnamed: 0,title_len,summary_len,paragraph_len
count,306.0,306.0,306.0
mean,86.124183,155.22549,508.879085
std,25.982042,49.86544,409.975299
min,28.0,1.0,19.0
25%,67.25,119.0,366.5
50%,84.0,154.0,463.5
75%,100.75,192.75,563.25
max,175.0,253.0,5890.0


Meter aquí un nuevo código para limpiar los párrafos de las fechas que están al inicio, al final eso no se necesita.

## new_summary

Notice that one of the summaries has a length of 1. In his case, it consisted of just one character extracted from the scrapper process.

In [16]:
filtered_df = df[df['summary_len'] == 1]
print(filtered_df['summary'])

226    -
Name: summary, dtype: object


A new variable calle 'new_summary' was created to have the new information. 

In [17]:
df['new_summary'] = df['summary']
df.loc[df['summary_len'] == 1, 'new_summary'] = None

## new_paragraph

One of the paragraphs consisted of just one date. No any other information.

In [18]:
filtered_df = df[df['paragraph_len'] == 19]
print(filtered_df['paragraph'])

270    20 de abril de 2023
Name: paragraph, dtype: object


Most of the paragraphs start with the city (or cities) and the date from which the article is about. For example: "Ciudad de México, 2 de julio de 2025". 

In [19]:
df['paragraph'].head()

0    São Paulo, Brasil, agosto 14, 2025 – Nu Holdin...
1    São Paulo, Brasil – 12 de agosto de 2025 – Nub...
2    São Paulo, 5 de agosto de 2025 – Pix inauguró ...
3    Ciudad de México, 31 de julio de 2025 – Nu anu...
4    El programa ofrece oportunidades para trabajar...
Name: paragraph, dtype: object

Here I look for any posibilities to clean the paragraphs from these dates + cities combinations. I had to include different combinations using REGEX.

In [20]:
#Casos: 287, 276

def clean_paragraph_format(text):
    match = re.search(r'^.*,\s*(\w*\sde\s\w*\s|\w*\s|\w*\s*\w*|\w*\s\d*\s)(de|del|,)\s\d*(\.|\s*[-:–—])', text)
    if match:
        text = text[match.end():].strip()
    
    return text

In [21]:
new_paragraphs = []
for paragraph in df['paragraph']:
    new_paragraphs.append(clean_paragraph_format(paragraph))    

df['new_paragraph'] = new_paragraphs

Search if the original paragraph with a length of 19 was removed or not from the cleaning process.

In [22]:
filtered_df = df[df['new_paragraph'].str.len() == 19]
print(filtered_df['new_paragraph'])

270    20 de abril de 2023
Name: new_paragraph, dtype: object


It was not removed, so I proceed to removed.

In [23]:
df.loc[df['new_paragraph'].str.len() == 19, 'new_paragraph'] = None

In [24]:
df['new_summary_len'] = df['new_summary'].str.len()
df['new_paragraph_len'] = df['new_paragraph'].str.len()

df[['new_summary_len', 'new_paragraph_len']].describe()

Unnamed: 0,new_summary_len,new_paragraph_len
count,305.0,305.0
mean,155.731148,478.881967
std,49.155297,408.114341
min,41.0,41.0
25%,119.0,340.0
50%,154.0,431.0
75%,193.0,537.0
max,253.0,5849.0


In [25]:
df.loc[df['new_paragraph'].str.len() == 5849]['new_paragraph']

217    Nu México, empresa de finanzas digitales trans...
Name: new_paragraph, dtype: object

Some paragraphs have really long text. The reason for this is that the entire article was extracted, instead of just the first paragraph. Since most of the article have been properly extracted, for now I am going to leave this long paragraphs like this. To fix this I can either substitute this paragraph with summary or refine the code to prevent these situations.

In [26]:
df['new_summary'] = df['new_summary'].fillna('')
df['new_paragraph'] = df['new_paragraph'].fillna('')

In [27]:
(df['title'].str.lower() == df['new_summary'].str.lower()).mean()

np.float64(0.0032679738562091504)

In [28]:
df['new_summary_paragraph'] = df.apply(
    lambda row: row['new_summary'].lower() in row['new_paragraph'].lower(), axis=1)

df['new_summary_paragraph'].mean()

np.float64(0.049019607843137254)