<a href="https://colab.research.google.com/github/aliasoblomov/huggingface-to-bigquery-transfer/blob/main/tansfer_divar_data_from_huggingface_to_bigquery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CELL 1: Install Libraries
# The leading "!" runs the line as a shell command (notebook syntax, not plain Python).
# Installs the packages imported by the later cells (datasets, pandas,
# google-cloud-bigquery) together with pyarrow and db-dtypes; -q reduces pip's output.

!pip install datasets pandas google-cloud-bigquery pyarrow db-dtypes -q

In [None]:
# CELL 2: Authenticate to Google Cloud
# Uses the google.colab helper module, so this cell is meant to run inside Colab.
from google.colab import auth
# NOTE(review): presumably triggers Colab's interactive Google sign-in and makes
# the credentials available to the BigQuery client created later — confirm.
auth.authenticate_user()
print('✅ Authenticated')


In [None]:
# CELL 3: Download from HF & Load into BigQuery (with retries)
# Edit the values below before running; the rest of the cell only reads them.

# === CONFIGURATION: REPLACE THESE ===
gcp_project_id = "azw-ua"      # ← your GCP project ID (used for the BQ client and the table reference)
bq_dataset_id  = "real_estate_data"     # ← your existing BigQuery dataset
bq_table_id    = "divar_real_estate_ads"   # ← name for the new table
hf_dataset     = "divaroffical/real_estate_ads"  # Hugging Face dataset id passed to load_dataset
hf_split       = "train"  # dataset split passed to load_dataset
bq_location    = "US"                      # ← match your dataset location (passed to bigquery.Client)
# ===================================

import time
import pandas as pd
from datasets import load_dataset
from google.cloud import bigquery

# Destination table in "project.dataset.table" form
table_ref = "{}.{}.{}".format(gcp_project_id, bq_dataset_id, bq_table_id)

# Show the source and the destination before any work starts.
print(
    f"→ HF dataset: {hf_dataset} [{hf_split}]",
    f"→ BQ table:   {table_ref}  (location={bq_location})\n",
    sep="\n",
)

# 1) Fetch the requested split from Hugging Face and convert it to pandas
print("1) Downloading Hugging Face dataset…")
hf_ds = load_dataset(hf_dataset, split=hf_split)
df = hf_ds.to_pandas()
# df.shape is (rows, cols); unpack it straight into the summary line.
print("   → Downloaded & converted to DataFrame: {} rows, {} cols\n".format(*df.shape))

# 2) Create the BigQuery client and the load-job settings
client = bigquery.Client(project=gcp_project_id, location=bq_location)

job_config = bigquery.LoadJobConfig()
# WRITE_TRUNCATE: existing table data is overwritten by the load.
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
job_config.autodetect = True

# 3) Load the DataFrame into BigQuery, retrying with exponential backoff
# (2s, 4s, 8s, ... between attempts; the final failure is re-raised).
max_retries = 5
for attempt in range(1, max_retries + 1):
    try:
        print(f"attempt={attempt!r}: Starting load_job…")
        job = client.load_table_from_dataframe(
            df,
            table_ref,
            job_config=job_config,
        )
        job.result()  # wait for the load job to finish
        print(f"✅ Loaded {job.output_rows} rows into {table_ref}")
    except Exception as exc:
        # NOTE(review): every exception type is retried, including ones that
        # are unlikely to succeed on a second try — confirm this is intended.
        print(f"❌ Attempt {attempt} failed: {exc}")
        if attempt < max_retries:
            delay = 2 ** attempt
            print(f"   ↳ retrying in {delay}s…")
            time.sleep(delay)
            continue
        # Out of attempts: surface the last error as the cause.
        raise RuntimeError("All retries failed—aborting.") from exc
    else:
        # Load succeeded; no further attempts needed.
        break

print("\n🎉 All done!")
