### ETL Process

In [1]:
import pandas as pd

### News

In [2]:
# Load the 2024-07-08 news parquet file and preview its first rows.
news = pd.read_parquet('../../data/2024-07-08_news_data.parquet')
news.head()

Unnamed: 0,Title,Source,Publication Time,Author
0,Resumen de noticias de las elecciones en Franc...,CNN en Español,10:24:37,
1,Quién es Jean-Luc Mélenchon: el político de iz...,infobae,01:24:37,
2,"Macron pide a su primer ministro continuar ""po...",DW (Español),09:24:37,
3,La izquierda celebra en París su inesperada vi...,El Colombiano,22:24:37,
4,Gremios aseguran que no hay laboratorios que p...,El Colombiano,00:24:37,Andrés VillamizarDe Andrés Villamizar


In [15]:
## Remove Spanish stopwords (single-title demo)

from nltk.corpus import stopwords

# Build the stopword set once so every token lookup is a set membership test
# instead of a fresh call to stopwords.words() per token.
stop_words = set(stopwords.words("spanish"))

# Tokenise the title stored under index label 1 on whitespace.
words = news['Title'][1].split()
# The membership test is case-sensitive here (tokens are not lower-cased).
filtered_title = ' '.join([w for w in words if w not in stop_words])
print(filtered_title)

Quién Jean-Luc Mélenchon: político izquierda quiere sacar Francia OTAN apoya chavismo


In [17]:
## Building a reusable function
from nltk.corpus import stopwords
import re
# Spanish stopwords as a set, built once; clean_titles reads it for membership tests.
stop_words = set(stopwords.words("spanish"))

def clean_titles(df, stopword_set=None):
    """Return the ``Title`` column with punctuation and stopwords removed.

    Args:
        df: DataFrame containing a ``Title`` column.
        stopword_set: Optional collection of lower-case stopwords. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        pandas.Series: One cleaned title per input row. Entries that are not
        strings (for example missing values) are returned unchanged instead
        of raising ``TypeError``.
    """
    # Resolve the stopword collection once per call; the global is only
    # touched when the caller did not supply one.
    active_stopwords = stop_words if stopword_set is None else stopword_set

    def clean_text(text):
        # Delete every character that is neither a word character nor
        # whitespace. Characters are removed, not replaced by a space, so
        # hyphenated words are fused ("Jean-Luc" -> "JeanLuc").
        return re.sub(r'[^\w\s]', '', text)

    def clean_title(title):
        # Missing titles (None / NaN) cannot be tokenised; pass them through.
        if not isinstance(title, str):
            return title
        # Tokens are lower-cased only for the lookup; original casing is kept.
        kept = [w for w in clean_text(title).split() if w.lower() not in active_stopwords]
        return ' '.join(kept)

    return df['Title'].apply(clean_title)

In [18]:
# Try the title cleaner on the first three rows.
clean_titles(news.head(3))

0    Resumen noticias elecciones Francia 2024 resul...
1    Quién JeanLuc Mélenchon político izquierda qui...
2        Macron pide primer ministro continuar momento
Name: Title, dtype: object

In [47]:
## Remove duplicated author names

# Matches "<name>De <name>" bylines such as
# "Andrés VillamizarDe Andrés Villamizar". The captured name must be
# non-empty and start at a word boundary: with an empty capture the pattern
# would match any standalone "De " and mangle names like "Juan De Dios".
_DUPLICATED_BYLINE_RE = re.compile(r'\b(\w.*?)De\s\1', re.IGNORECASE)


def clean_author(author):
    """Collapse an author string in which the name appears twice.

    Two duplication shapes are handled:
      * ``"<name>De <name>"`` -> ``"<name>"``
      * a string whose first half of words equals its second half
        (``"Alberto Rojas Alberto Rojas"`` -> ``"Alberto Rojas"``)

    Args:
        author: Author string, or a missing value.

    Returns:
        The de-duplicated author string. Non-string inputs (None, NaN, ...)
        and empty strings are returned unchanged.
    """
    # Missing values and empty strings have nothing to clean.
    if not isinstance(author, str) or not author:
        return author

    author = _DUPLICATED_BYLINE_RE.sub(r'\1', author)

    # Second pass: exact repetition of the same word sequence.
    words = author.split()
    half_length = len(words) // 2
    if words[:half_length] == words[half_length:]:
        return ' '.join(words[:half_length])

    return author


In [49]:
# Apply the author cleaner to the first 20 rows to inspect the result.
news['Author'].head(20).apply(clean_author)

0                                                  None
1                                                  None
2                                                  None
3                                                  None
4                                     Andrés Villamizar
5                                                  None
6                                                  None
7                                                  None
8                                                  None
9                                                  None
10    Victoria Butenko, Maria Kostenko y Daria Taras...
11                                        Alberto Rojas
12                                                 None
13                                                 None
14                                  Luis Carvajal Basto
15                                                 None
16                                                 None
17                                     Angie Rui

### Clean Dollar Exchange Rate

In [50]:
# Load the 2024-07-08 exchange-rate parquet file and display it.
dolar =  pd.read_parquet('../../data/2024-07-08_exchange_rate.parquet')
dolar

Unnamed: 0,Date,Hour,Exchange_rate
0,2024-07-08,17:24:42.898065,"$ 4.096,09"


In [53]:
def clean_time(time_obj):
    """Format a time-like object as an ``HH:MM:SS`` string.

    The format has no sub-second field, so any fractional seconds are not
    included in the result.

    Args:
        time_obj: Object exposing a ``strftime`` method.

    Returns:
        str: The time rendered as hours, minutes and seconds.
    """
    hms_pattern = '%H:%M:%S'
    formatted = time_obj.strftime(hms_pattern)
    return formatted

In [54]:
# Format every value in the Hour column as HH:MM:SS.
dolar['Hour'].apply(clean_time)

0    17:24:42
Name: Hour, dtype: object

In [64]:
def clean_value(value_str, keep_decimals=False):
    """Parse a formatted amount such as ``"$ 4.096,09"`` into a number.

    The input uses ``.`` as thousands separator and ``,`` as decimal
    separator.

    Args:
        value_str: Formatted amount string.
        keep_decimals: When False (default) the part after the decimal comma
            is discarded and an ``int`` is returned (``"$ 4.096,09"`` ->
            ``4096``). When True the decimal part is kept and a ``float`` is
            returned (``4096.09``).

    Returns:
        int or float: The parsed amount, per ``keep_decimals``.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Strip the currency symbol and the thousands separators.
    digits = value_str.replace('$', '').replace('.', '')
    if keep_decimals:
        # Swap the decimal comma for a dot so float() can parse it.
        return float(digits.replace(',', '.'))
    # Truncate (do not round) at the decimal comma; int() ignores the
    # leftover surrounding whitespace.
    integer_part = digits.split(',')[0]
    return int(integer_part)


In [65]:
# Convert the formatted exchange-rate strings to integers.
dolar['Exchange_rate'].apply(clean_value)

0    4096
Name: Exchange_rate, dtype: int64