In [1]:
import pandas as pd
from tqdm import tqdm
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from newspaper import Article
import requests
from bs4 import BeautifulSoup


df = pd.read_csv("es5k.csv")  
urls = df["url"].dropna().unique().tolist()

checkpoint_file = "checkpoint_es.csv"

if os.path.exists(checkpoint_file):
    done_urls = pd.read_csv(checkpoint_file)["url"].tolist()
    urls = [u for u in urls if u not in done_urls]
    print(f"Resuming from checkpoint. Remaining: {len(urls)}")
else:
    print(f"Starting fresh. Total: {len(urls)}")


def fallback_scraper(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')

        title_tag = soup.find('h1') or soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else ""

        ps = soup.find_all('p')
        content = "\n".join([p.get_text(strip=True) for p in ps if len(p.get_text(strip=True)) > 10])
        return title, content
    except:
        return "", ""

def extract_article_text_and_title(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        title = article.title or ""
        text = article.text or ""
        
        if len(text) < 100:
            f_title, f_text = fallback_scraper(url)
            title = title if len(title) > 3 else f_title
            text = f_text if len(f_text) > 100 else text

        return url, title, text
    except:
        return url, "", ""

# ------------  concurrence
results = []
batch_size = 500

with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_url = {executor.submit(extract_article_text_and_title, url): url for url in urls}
    for i, future in enumerate(tqdm(as_completed(future_to_url), total=len(urls))):
        results.append(future.result())
        
        if (i + 1) % batch_size == 0 or (i + 1) == len(urls):
            df_temp = pd.DataFrame(results, columns=["url", "title", "content"])
            df_temp.to_csv(checkpoint_file, index=False)
            print(f"Saved {len(results)} articles to {checkpoint_file}")

# ------------ final
df_final = pd.DataFrame(results, columns=["url", "title", "content"])
df_final.to_csv("es5k_with_content.csv", index=False)
print("All done. Saved full data to es5k_with_content.csv")


Starting fresh. Total: 5000


 10%|████                                    | 501/5000 [00:55<06:15, 11.98it/s]

Saved 500 articles to checkpoint_es.csv


 20%|███████▉                                | 999/5000 [01:48<08:36,  7.74it/s]

Saved 1000 articles to checkpoint_es.csv


 30%|███████████▋                           | 1501/5000 [02:44<08:02,  7.25it/s]

Saved 1500 articles to checkpoint_es.csv


 40%|███████████████▌                       | 2003/5000 [03:49<07:37,  6.55it/s]

Saved 2000 articles to checkpoint_es.csv


 50%|███████████████████▌                   | 2501/5000 [04:56<03:51, 10.78it/s]

Saved 2500 articles to checkpoint_es.csv


 60%|███████████████████████▍               | 3004/5000 [05:59<03:28,  9.59it/s]

Saved 3000 articles to checkpoint_es.csv


 70%|███████████████████████████▎           | 3501/5000 [07:01<03:31,  7.08it/s]

Saved 3500 articles to checkpoint_es.csv


 80%|███████████████████████████████▏       | 4003/5000 [08:00<01:37, 10.27it/s]

Saved 4000 articles to checkpoint_es.csv


 90%|███████████████████████████████████    | 4500/5000 [08:56<01:30,  5.53it/s]

Saved 4500 articles to checkpoint_es.csv


100%|███████████████████████████████████████| 5000/5000 [10:01<00:00,  8.32it/s]

Saved 5000 articles to checkpoint_es.csv
All done. Saved full data to es5k_with_content.csv





In [5]:
df = pd.read_csv("es5k_with_content.csv")
df.iloc[4]  


url        https://eldia.com.do/autoferia-popular-concluy...
title      Autoferia Popular concluye con RD$17,000 millo...
content    Santo Domingo. – La vigésima novena Autoferia ...
Name: 4, dtype: object

In [21]:
df = pd.read_csv("es5k_with_content.csv")
df.head(5)


Unnamed: 0,url,title,content
0,https://www.apfdigital.com.ar/noticias/2024/12...,El IAPV abordó las nuevas operatorias de vivie...,Las nuevas operatorias de viviendas fueron el ...
1,https://www.nvinoticias.com/cultura/el-lector-...,El lector furtivo / Boy. Relatos de infancia,Por Rafael Alfonso\n\nDespués de una larga est...
2,https://www.cronicaviva.com.pe/peru-economia-c...,"Perú: economía crecería 3.1% en 2024, con défi...","La economía peruana avanzaría 3.1% en 2024, pr..."
3,https://eldia.com.do/persio-maldonado-critica-...,Persio Maldonado critica poderes del Estado se...,Santo Domingo.-El presidente de la Sociedad Do...
4,https://eldia.com.do/autoferia-popular-concluy...,"Autoferia Popular concluye con RD$17,000 millo...",Santo Domingo. – La vigésima novena Autoferia ...


In [13]:
print(df.iloc[10]['title'])
print(df.iloc[10]['content'][:1500])  # 前500个字符


Cuadro preocupante en disposición de neumáticos
Sun and clouds mixed. High 88F. Winds E at 15 to 25 mph..
Partly cloudy. Low 77F. Winds ESE at 10 to 15 mph.
Actualizado: May 22, 2025 @ 9:20 am
El DRNA tampoco tiene cifras actualizadas sobre la importación de los neumáticos y los ingresos que genera. >Suministrada
La Oficina del Procurador del Ciudadano (Ombudsman) informó que, tras una investigación, se descubrió un cuadro preocupante sobre el manejo del Departamento de Recursos Naturales y Ambientales (DRNA) en la acumulación de neumáticos desechados, principalmente en el centro de Puerto Rico.
El DRNA tampoco tiene cifras actualizadas sobre la importación de los neumáticos y los ingresos que genera, reveló el ombudsman Edwin García Feliciano, quien precisó que el periodo de investigación cubrió de 2022 a junio de 2024 y abarcó visitas y entrevistas en más de 100 instalaciones de venta de neumáticos y empresas responsables del proceso de disposición en todo Puerto Rico.
Los hallazgos 

In [23]:

df_es = pd.read_csv("es5k_with_content.csv")
df_es.head()
# non-null
print("Total records:", len(df_es))
print("Non-empty content:", df_es['content'].notna().sum())
print("Content length stats:")
print(df_es['content'].str.len().describe())

# filter out content less than 300 char，including null
df_clean_es = df_es[df_es['content'].str.len() > 300]
print("Remaining after filtering:", len(df_clean_es))


Total records: 5000
Non-empty content: 4005
Content length stats:
count     4005.000000
mean      3464.808989
std       3384.769839
min         78.000000
25%       1842.000000
50%       2739.000000
75%       4220.000000
max      84693.000000
Name: content, dtype: float64
Remaining after filtering: 3858
