# UNIR CSVs

In [1]:
import pandas as pd
from dateutil import parser as date_parser

# Rutas a los archivos
path_newsapi = "/workspaces/final-project/data/raw/12.11.25-12.12.25.noticias_raw_sentimiento.csv"
path_cryptonews = "/workspaces/final-project/data/raw/cryptonews.csv"
path_sentiments = "/workspaces/final-project/data/raw/bitcoin_sentiments_21_24.csv"
path_titles = "/workspaces/final-project/data/raw/bitcoin_titles.csv"

# Cargar cada CSV
df_newsapi = pd.read_csv(path_newsapi)
df_cryptonews = pd.read_csv(path_cryptonews)
df_sentiments = pd.read_csv(path_sentiments)
df_titles = pd.read_csv(path_titles)

# Función para convertir cualquier fecha a formato ISO UTC como NewsAPI
def to_newsapi_date(date_str):
    if pd.isna(date_str):
        return None
    try:
        parsed = date_parser.parse(str(date_str))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=None)
        return parsed.strftime('%Y-%m-%dT%H:%M:%SZ')
    except:
        return None

# Estandarizar columna de fecha en cada dataset y renombrarla a 'publishedAt'
df_newsapi['publishedAt'] = df_newsapi['publishedAt'].apply(to_newsapi_date)

df_cryptonews['publishedAt'] = df_cryptonews['date'].apply(to_newsapi_date)

df_sentiments['publishedAt'] = df_sentiments['Date'].apply(to_newsapi_date)

df_titles['publishedAt'] = df_titles['Date'].apply(to_newsapi_date)

# Añadir columna 'axis' donde no exista (todos BTC)
for df in [df_cryptonews, df_sentiments, df_titles]:
    df['axis'] = 'BTC'

# Seleccionar y renombrar columnas para mantener consistencia mínima
# Mantengo todas las originales, solo aseguro que tengan publishedAt y axis

# Unir todos los dataframes
df_unified = pd.concat([df_newsapi, df_cryptonews, df_sentiments, df_titles], ignore_index=True)

# Eliminar filas sin fecha válida
df_unified = df_unified.dropna(subset=['publishedAt']).copy()

# Guardar el CSV unificado
output_path = "/workspaces/final-project/data/raw/unified_noticias_raw.csv"
df_unified.to_csv(output_path, index=False)

print(f"CSV unificado guardado en: {output_path}")
print(f"Total filas: {len(df_unified)}")
print("Columnas finales:", df_unified.columns.tolist())

CSV unificado guardado en: /workspaces/final-project/data/raw/unified_noticias_raw.csv
Total filas: 51897
Columnas finales: ['publishedAt', 'title', 'description', 'source', 'axis', 'date', 'sentiment', 'subject', 'text', 'url', 'Date', 'Short Description', 'Accurate Sentiments', 'Unnamed: 0', 'Title', 'Links']


In [2]:
import pandas as pd

df_unified = pd.read_csv("/workspaces/final-project/data/raw/unified_news_raw.csv")
df_unified.head()

  df_unified = pd.read_csv("/workspaces/final-project/data/raw/unified_news_raw.csv")


Unnamed: 0.1,publishedAt,title,description,source,axis,date,sentiment,subject,text,url,Date,Short Description,Accurate Sentiments,Unnamed: 0,Title,Links
0,2025-11-12T03:31:05Z,"Bitcoin Bottomed At $98,000, Analyst Says: Tim...",Bitcoin (CRYPTO: BTC) has likely established i...,Yahoo Entertainment,BTC,,,,,,,,,,,
1,2025-11-12T21:31:08Z,Bitcoin User Accidentally Pays Over $105K To S...,A Bitcoin (CRYPTO: BTC) user paid a staggering...,Yahoo Entertainment,BTC,,,,,,,,,,,
2,2025-11-12T19:00:00Z,Billionaire twins–backed stock surges 100% aft...,"Leap Therapeutics (Nasdaq: LPTX), a Cambridge,...",TheStreet,BTC,,,,,,,,,,,
3,2025-11-12T19:01:08Z,"Take Profits On Bitcoin—It's 'Fall Season', Mo...","Bitcoin (CRYPTO: BTC) tapped $105,000 before r...",Yahoo Entertainment,BTC,,,,,,,,,,,
4,2025-11-12T12:15:00Z,Demand Revival: Crypto Daybook Americas,"The day ahead in crypto: Nov. 12, 2025",CoinDesk,BTC,,,,,,,,,,,


In [4]:
import pandas as pd

# Ruta al CSV unificado
path = "/workspaces/final-project/data/raw/unified_news_raw.csv"

df = pd.read_csv(path)

print("Filas iniciales:", len(df))
print("Columnas iniciales:", df.columns.tolist())

# 1. Unificar todo el texto en una sola columna: text_nlp
# Fuentes posibles de título
title_cols = ['title', 'Title', 'Short Description']
title = df[title_cols].bfill(axis=1).iloc[:, 0]  # Toma la primera no nula

# Fuentes posibles de descripción/texto completo
desc_cols = ['description', 'text', 'Short Description']
desc = df[desc_cols].bfill(axis=1).iloc[:, 0].fillna('')

# Combinar título + descripción
df['text_nlp'] = (title.astype(str).str.strip() + " " + desc.astype(str).str.strip()).str.strip()

# 2. Combinar source + url en una sola columna
df['source_url'] = df['source'].astype(str).str.strip()
mask_url = df['url'].notna() | df['Links'].notna()
df.loc[mask_url, 'source_url'] = df.loc[mask_url, 'source_url'] + " - " + \
    df.loc[mask_url, ['url', 'Links']].bfill(axis=1).iloc[:, 0].astype(str)

df['source_url'] = df['source_url'].str.replace(' - nan', '', regex=False).str.strip()

# 3. Mantener sentiment precomputado (para comparar después)
sent_cols = ['sentiment', 'Accurate Sentiments']
df['precomputed_sentiment'] = df[sent_cols].bfill(axis=1).iloc[:, 0]

# 4. Eliminar columnas innecesarias
cols_to_drop = [
    'Unnamed: 0', 'subject', 'date', 'Date', 'url', 'Links',
    'title', 'Title', 'description', 'text', 'Short Description',
    'sentiment', 'Accurate Sentiments', 'source'
]
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

# 5. Reordenar columnas finales
df = df[['publishedAt', 'text_nlp', 'source_url', 'axis', 'precomputed_sentiment']]

# 6. Quitar duplicados por texto y fecha (evita repeticiones exactas)
df = df.drop_duplicates(subset=['text_nlp', 'publishedAt']).reset_index(drop=True)

# 7. Guardar versión limpia lista para análisis de sentimiento
output_path = "/workspaces/final-project/data/processed/unified_noticias_clean.csv"
df.to_csv(output_path, index=False)

print("\nLimpieza completada")
print("Filas finales:", len(df))
print("Columnas finales:", df.columns.tolist())
print("Guardado en:", output_path)
df.head(10)

  df = pd.read_csv(path)


Filas iniciales: 51897
Columnas iniciales: ['publishedAt', 'title', 'description', 'source', 'axis', 'date', 'sentiment', 'subject', 'text', 'url', 'Date', 'Short Description', 'Accurate Sentiments', 'Unnamed: 0', 'Title', 'Links']


  df['precomputed_sentiment'] = df[sent_cols].bfill(axis=1).iloc[:, 0]



Limpieza completada
Filas finales: 51742
Columnas finales: ['publishedAt', 'text_nlp', 'source_url', 'axis', 'precomputed_sentiment']
Guardado en: /workspaces/final-project/data/processed/unified_noticias_clean.csv


Unnamed: 0,publishedAt,text_nlp,source_url,axis,precomputed_sentiment
0,2025-11-12T03:31:05Z,"Bitcoin Bottomed At $98,000, Analyst Says: Tim...",Yahoo Entertainment,BTC,
1,2025-11-12T21:31:08Z,Bitcoin User Accidentally Pays Over $105K To S...,Yahoo Entertainment,BTC,
2,2025-11-12T19:00:00Z,Billionaire twins–backed stock surges 100% aft...,TheStreet,BTC,
3,2025-11-12T19:01:08Z,"Take Profits On Bitcoin—It's 'Fall Season', Mo...",Yahoo Entertainment,BTC,
4,2025-11-12T12:15:00Z,Demand Revival: Crypto Daybook Americas The da...,CoinDesk,BTC,
5,2025-11-12T02:23:11Z,Asia Morning Briefing: Bitcoin ETFs Pull In $3...,CoinDesk,BTC,
6,2025-11-12T18:42:46Z,Daily Deal: Costco 1-Year Gold Star Membership...,Techdirt,BTC,
7,2025-11-12T21:38:20Z,A $25 Billion Bitcoin Bet Frays as Doubt Hits ...,Yahoo Entertainment,BTC,
8,2025-11-12T02:17:22Z,The first partner of California donned a strik...,Daily Beast,BTC,
9,2025-11-12T08:04:03Z,I'm convinced — Fiio FT7 is the flagship kille...,Android Central,BTC,


In [5]:
# === LIMPIEZA FINAL PARA NLP (ELIMINA VACÍAS Y DUPLICADAS) ===

import pandas as pd

# Ruta al CSV limpio anterior
path = "/workspaces/final-project/data/processed/unified_noticias_clean.csv"

df = pd.read_csv(path)

print("Filas iniciales:", len(df))

# 1. Convertir publishedAt a datetime UTC
df['publishedAt'] = pd.to_datetime(df['publishedAt'], utc=True, errors='coerce')

# Eliminar filas sin fecha válida
df = df.dropna(subset=['publishedAt']).copy()

# 2. Limpiar text_nlp: asegurar string, quitar espacios y eliminar filas sin texto
df['text_nlp'] = df['text_nlp'].astype(str).str.strip()
df = df[df['text_nlp'] != ''].copy()
df = df[df['text_nlp'] != 'nan'].copy()

# 3. Limpiar source_url (quitar restos)
df['source_url'] = df['source_url'].astype(str).str.replace('nan', '', regex=False).str.strip()

# 4. Crear bloque de 1 hora
df['datetime_1h'] = df['publishedAt'].dt.floor('1h')

# 5. Eliminar duplicados por texto + hora (más estricto)
df = df.drop_duplicates(subset=['text_nlp', 'datetime_1h']).reset_index(drop=True)

# 6. Eliminar filas donde text_nlp sea muy corto (menos de 10 caracteres, probablemente basura)
df = df[df['text_nlp'].str.len() >= 10].copy()

# 7. Reset index final
df = df.reset_index(drop=True)

# 8. Guardar versión lista para NLP
output_path = "/workspaces/final-project/data/processed/unified_noticias_nlp_ready.csv"
df.to_csv(output_path, index=False)

print("\nLimpieza NLP completada")
print("Filas finales:", len(df))
print("Rango temporal:", df['publishedAt'].min(), "→", df['publishedAt'].max())
print("Guardado en:", output_path)

display(df[['publishedAt', 'text_nlp', 'source_url', 'axis']].head(10))

Filas iniciales: 51742

Limpieza NLP completada
Filas finales: 51724
Rango temporal: 2020-07-02 00:00:00+00:00 → 2025-12-11 23:59:00+00:00
Guardado en: /workspaces/final-project/data/processed/unified_noticias_nlp_ready.csv


Unnamed: 0,publishedAt,text_nlp,source_url,axis
0,2025-11-12 03:31:05+00:00,"Bitcoin Bottomed At $98,000, Analyst Says: Tim...",Yahoo Entertainment,BTC
1,2025-11-12 21:31:08+00:00,Bitcoin User Accidentally Pays Over $105K To S...,Yahoo Entertainment,BTC
2,2025-11-12 19:00:00+00:00,Billionaire twins–backed stock surges 100% aft...,TheStreet,BTC
3,2025-11-12 19:01:08+00:00,"Take Profits On Bitcoin—It's 'Fall Season', Mo...",Yahoo Entertainment,BTC
4,2025-11-12 12:15:00+00:00,Demand Revival: Crypto Daybook Americas The da...,CoinDesk,BTC
5,2025-11-12 02:23:11+00:00,Asia Morning Briefing: Bitcoin ETFs Pull In $3...,CoinDesk,BTC
6,2025-11-12 18:42:46+00:00,Daily Deal: Costco 1-Year Gold Star Membership...,Techdirt,BTC
7,2025-11-12 21:38:20+00:00,A $25 Billion Bitcoin Bet Frays as Doubt Hits ...,Yahoo Entertainment,BTC
8,2025-11-12 02:17:22+00:00,The first partner of California donned a strik...,Daily Beast,BTC
9,2025-11-12 08:04:03+00:00,I'm convinced — Fiio FT7 is the flagship kille...,Android Central,BTC
