<a href="https://colab.research.google.com/github/alyssa-tsh/CS3244_ML_Project/blob/main/reuters_dataset_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import requests
import pandas as pd
import time
from tqdm import tqdm
from IPython.display import display
from datetime import datetime
from collections import Counter
import ast

# ------------------------
# Function: fetch all rows for a split
# ------------------------
def fetch_reuters_split(dataset="rjjan/reuters21578",
                        config="ModApte",
                        split="train",
                        batch_size=100):
    """
    Fetch all rows from a Hugging Face dataset split via the API.
    Returns a pandas DataFrame.
    """

    all_rows = []
    offset = 0

    # First request to determine total rows
    url = "https://datasets-server.huggingface.co/rows"
    params = {"dataset": dataset, "config": config, "split": split, "offset": 0, "length": 1}

    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()

    total_rows = data.get("num_rows_total", 55737)  # fallback
    print(f"Total rows in {split} split: {total_rows}")

    pbar = tqdm(total=total_rows, desc=f"Fetching {split}")

    while offset < total_rows:
        try:
            params = {
                "dataset": dataset,
                "config": config,
                "split": split,
                "offset": offset,
                "length": min(batch_size, total_rows - offset)
            }

            resp = requests.get(url, params=params)
            resp.raise_for_status()
            batch_data = resp.json()

            batch_rows = [r["row"] for r in batch_data["rows"]]
            all_rows.extend(batch_rows)

            offset += len(batch_rows)
            pbar.update(len(batch_rows))

            time.sleep(0.2)  # small pause
        except Exception as e:
            print(f"Error at offset {offset}: {e}, retrying in 5s...")
            time.sleep(5)
            continue

    pbar.close()

    df_split = pd.DataFrame(all_rows)
    df_split["split"] = split

    return df_split

# ------------------------
# Step 1: Fetch train and test individually
# ------------------------
print("Fetching TRAIN split...")
df_train = fetch_reuters_split(split="train")
print(f"\nâœ… TRAIN split fetched: {len(df_train)} rows\n")

print("Fetching TEST split...")
df_test = fetch_reuters_split(split="test")
print(f"\nâœ… TEST split fetched: {len(df_test)} rows\n")

# ------------------------
# Step 2: Save each split individually
# ------------------------
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

train_csv = f"reuters21578_train_{timestamp}.csv"
test_csv = f"reuters21578_test_{timestamp}.csv"
df_train.to_csv(train_csv, index=False)
df_test.to_csv(test_csv, index=False)
print(f"ðŸ’¾ Saved TRAIN split to {train_csv}")
print(f"ðŸ’¾ Saved TEST split to {test_csv}")

# Also save as parquet for faster future use
train_parquet = f"reuters21578_train_{timestamp}.parquet"
test_parquet = f"reuters21578_test_{timestamp}.parquet"
df_train.to_parquet(train_parquet, index=False)
df_test.to_parquet(test_parquet, index=False)
print(f"ðŸ’¾ Saved TRAIN split as parquet: {train_parquet}")
print(f"ðŸ’¾ Saved TEST split as parquet: {test_parquet}")

# ------------------------
# Step 3: Keep train/test DataFrames in memory
# ------------------------
print("\nDataFrames available in memory: df_train, df_test")
display(df_train.head())
display(df_test.head())

# ------------------------
# Step 4: Optional - combine into full dataset
# ------------------------
df_full = pd.concat([df_train, df_test], ignore_index=True)
full_csv = f"reuters21578_full_{timestamp}.csv"
df_full.to_csv(full_csv, index=False)
print(f"\nðŸ’¾ Combined full dataset saved to {full_csv}")


Fetching TRAIN split...
Total rows in train split: 9603


Fetching train:  58%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š    | 5600/9603 [00:29<00:24, 165.26it/s]

Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...


Fetching train:  58%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š    | 5600/9603 [00:41<00:24, 165.26it/s]

Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=train&offset=5600&length=100, retrying in 5s...
Error at offset 5600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=tra

Fetching train: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 9603/9603 [02:04<00:00, 77.21it/s] 



âœ… TRAIN split fetched: 9603 rows

Fetching TEST split...
Total rows in test split: 3299


Fetching test:  48%|â–ˆâ–ˆâ–ˆâ–ˆâ–Š     | 1600/3299 [00:08<00:09, 181.62it/s]

Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...


Fetching test:  48%|â–ˆâ–ˆâ–ˆâ–ˆâ–Š     | 1600/3299 [00:26<00:09, 181.62it/s]

Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&offset=1600&length=100, retrying in 5s...
Error at offset 1600: 429 Client Error: Too Many Requests for url: https://datasets-server.huggingface.co/rows?dataset=rjjan%2Freuters21578&config=ModApte&split=test&of

Fetching test: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3299/3299 [01:42<00:00, 32.10it/s]



âœ… TEST split fetched: 3299 rows

ðŸ’¾ Saved TRAIN split to reuters21578_train_20260212_090227.csv
ðŸ’¾ Saved TEST split to reuters21578_test_20260212_090227.csv
ðŸ’¾ Saved TRAIN split as parquet: reuters21578_train_20260212_090227.parquet
ðŸ’¾ Saved TEST split as parquet: reuters21578_test_20260212_090227.parquet

DataFrames available in memory: df_train, df_test


Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title,split
0,"Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n The dry period means the tempor...","""NORM""",[cocoa],"""TRAIN""","""TRAINING-SET""","""5544""","""1""","[el-salvador, usa, uruguay]",[],[],[],26-FEB-1987 15:01:01.79,BAHIA COCOA REVIEW,train
1,The U.S. Agriculture Department\nreported the farmer-owned reserve national five-day average\nprice through February 25 as follows (Dlrs/Bu-Sorghum Cwt) -\n Natl Loan Release Call\n Avge Rate-X Level Price Price\n Wheat 2.55 2.40 IV 4.65 --\...,"""NORM""","[grain, wheat, corn, barley, oat, sorghum]","""TRAIN""","""TRAINING-SET""","""5548""","""5""",[usa],[],[],[],26-FEB-1987 15:10:44.60,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,train
2,"Argentine grain board figures show\ncrop registrations of grains, oilseeds and their products to\nFebruary 11, in thousands of tonnes, showing those for futurE\nshipments month, 1986/87 total and 1985/86 total to February\n12, 1986, in brackets:\n Bread wheat prev 1,655.8, Feb 872.0, March 16...","""NORM""","[veg-oil, linseed, lin-oil, soy-oil, sun-oil, soybean, oilseed, corn, sunseed, grain, sorghum, wheat]","""TRAIN""","""TRAINING-SET""","""5549""","""6""",[argentina],[],[],[],26-FEB-1987 15:14:36.41,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,train
3,Moody's Investors Service Inc said it\nlowered the debt and preferred stock ratings of USX Corp and\nits units. About seven billion dlrs of securities is affected.\n Moody's said Marathon Oil Co's recent establishment of up\nto one billion dlrs in production payment facilities on its\nprolifi...,"""NORM""",[],"""TRAIN""","""TRAINING-SET""","""5551""","""8""",[usa],[],[],[],26-FEB-1987 15:15:40.12,USX &lt;X> DEBT DOWGRADED BY MOODY'S,train
4,"Champion Products Inc said its\nboard of directors approved a two-for-one stock split of its\ncommon shares for shareholders of record as of April 1, 1987.\n The company also said its board voted to recommend to\nshareholders at the annual meeting April 23 an increase in the\nauthorized capit...","""NORM""",[earn],"""TRAIN""","""TRAINING-SET""","""5552""","""9""",[usa],[],[],[],26-FEB-1987 15:17:11.20,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT,train


Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title,split
0,"Mounting trade friction between the\nU.S. And Japan has raised fears among many of Asia's exporting\nnations that the row could inflict far-reaching economic\ndamage, businessmen and officials said.\n They told Reuter correspondents in Asian capitals a U.S.\nMove against Japan might boost pro...","""NORM""",[trade],"""TEST""","""TRAINING-SET""","""3809""","""14826""","[hong-kong, usa, japan, taiwan, malaysia, south-korea, australia]",[],[],[],8-APR-1987 01:03:47.52,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT,test
1,"A survey of 19 provinces and seven cities\nshowed vermin consume between seven and 12 pct of China's grain\nstocks, the China Daily said.\n It also said that each year 1.575 mln tonnes, or 25 pct, of\nChina's fruit output are left to rot, and 2.1 mln tonnes, or up\nto 30 pct, of its vegetable...","""NORM""",[grain],"""TEST""","""TRAINING-SET""","""3811""","""14828""",[china],[],[],[],8-APR-1987 01:19:17.29,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS,test
2,"The Ministry of International Trade and\nIndustry (MITI) will revise its long-term energy supply/demand\noutlook by August to meet a forecast downtrend in Japanese\nenergy demand, ministry officials said.\n MITI is expected to lower the projection for primary energy\nsupplies in the year 2000...","""NORM""","[crude, nat-gas]","""TEST""","""TRAINING-SET""","""4356""","""14829""",[japan],[],[],[],8-APR-1987 01:22:17.25,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS,test
3,"Thailand's trade deficit widened to 4.5\nbillion baht in the first quarter of 1987 from 2.1 billion a\nyear ago, the Business Economics Department said.\n It said Janunary/March imports rose to 65.1 billion baht\nfrom 58.7 billion. Thailand's improved business climate this\nyear resulted in a...","""NORM""","[trade, grain, rice, corn, sugar, tin, rubber]","""TEST""","""TRAINING-SET""","""3815""","""14832""",[thailand],[],[],[],8-APR-1987 01:45:09.09,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER,test
4,"Indonesia expects crude palm oil (CPO)\nprices to rise sharply to between 450 and 550 dlrs a tonne FOB\nsometime this year because of better European demand and a fall\nin Malaysian output, Hasrul Harahap, junior minister for tree\ncrops, told Indonesian reporters.\n Prices of Malaysian and S...","""NORM""","[veg-oil, palm-oil]","""TEST""","""TRAINING-SET""","""3816""","""14833""","[indonesia, malaysia]",[],[],[],8-APR-1987 01:48:20.11,INDONESIA SEES CPO PRICE RISING SHARPLY,test



ðŸ’¾ Combined full dataset saved to reuters21578_full_20260212_090227.csv
