In [1]:
import requests
import random

# Define the API endpoint and parameters
url = "https://datasets-server.huggingface.co/rows"
params = {
    "dataset": "nyu-dice-lab/wavepulse-radio-raw-transcripts",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100  # Fetch 100 rows per request
}

# Function to get the total number of rows in the dataset
def get_total_rows():
    try:
        # Make a request to fetch metadata about the dataset
        response = requests.get(url, params={**params, "offset": 0, "length": 1})
        response.raise_for_status()  # Raise an error for bad status codes
        data = response.json()
        
        # Check if the response contains the total number of rows
        if "num_rows_total" in data:
            return data["num_rows_total"]
        else:
            print("Error: 'num_rows_total' not found in the API response.")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data: {e}")
        return None

# Initialize an empty list to store all rows
all_rows = []

# Get the total number of rows in the dataset
total_rows = get_total_rows()
if total_rows is None:
    print("Unable to determine the total number of rows in the dataset. Exiting.")
    exit()

# Number of random rows to fetch
num_random_rows = 1000  # Adjust this as needed

# Calculate the number of requests needed
num_requests = (num_random_rows + params["length"] - 1) // params["length"]

# Fetch random rows in chunks of 100
for _ in range(num_requests):
    # Generate a random offset, ensuring it doesn't exceed the dataset's bounds
    random_offset = random.randint(0, max(0, total_rows - params["length"]))
    params["offset"] = random_offset
    
    # Make the GET request to the API
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data: {e}")
        continue
    
    # Parse the JSON response
    data = response.json()
    
    # Extract the rows from the response
    if "rows" in data:
        rows = [row['row'] for row in data['rows']]
        all_rows.extend(rows)
        print(f"Fetched {len(rows)} rows starting at offset {random_offset}")
    else:
        print("Error: 'rows' not found in the API response.")
        continue

    # Stop if we've fetched enough rows
    if len(all_rows) >= num_random_rows:
        break

# Trim the list if we fetched more rows than needed
all_rows = all_rows[:num_random_rows]

# Now `all_rows` contains the desired number of randomly selected rows
print(f"Total rows fetched: {len(all_rows)}")

Failed to fetch data: 502 Server Error: Bad Gateway for url: https://datasets-server.huggingface.co/rows?dataset=nyu-dice-lab%2Fwavepulse-radio-raw-transcripts&config=default&split=train&offset=105276740&length=100
Fetched 100 rows starting at offset 560049537
Fetched 100 rows starting at offset 344609279
Fetched 100 rows starting at offset 56990448
Fetched 100 rows starting at offset 82007761
Fetched 100 rows starting at offset 432663039
Fetched 100 rows starting at offset 214045419
Fetched 100 rows starting at offset 409570493
Fetched 100 rows starting at offset 111369259
Fetched 100 rows starting at offset 448167349
Total rows fetched: 900


In [3]:
import pandas as pd
# Convert the list of rows to a Pandas DataFrame
df = pd.DataFrame(all_rows)

# Display the first few rows of the DataFrame
print(df.head())
print(f"Total rows in the dataset: {len(df)}")

              transcript_id  segment_index  start_time  end_time  \
0  WV_WSCW_2024_11_16_01_00            410    1779.612  1781.493   
1  WV_WSCW_2024_11_16_01_00            411    1781.713  1784.234   
2  WV_WSCW_2024_11_16_01_00            412    1784.294  1785.214   
3  WV_WSCW_2024_11_16_01_00            413    1785.254  1787.235   
4  WV_WSCW_2024_11_16_01_00            414    1787.655  1793.422   

                                                text station  \
0  No, no, the point is you can't talk to a liberal.    WSCW   
1  You've been lying so long you don't know how t...    WSCW   
2                                    Time to decide.    WSCW   
3                        Donald Trump for president!    WSCW   
4   The difference between Biden and Trump is tha...    WSCW   

              datetime state     speaker  
0  2024-11-16T01:00:00    WV  SPEAKER_02  
1  2024-11-16T01:00:00    WV  SPEAKER_02  
2  2024-11-16T01:00:00    WV  SPEAKER_02  
3  2024-11-16T01:00:00    WV  SPEA

In [4]:
# Get the unique states from the DataFrame
unique_states = df['state'].unique()

# Display the unique states
print("Unique states in the dataset:", unique_states)

Unique states in the dataset: ['WV' 'NH' 'CA' 'DC' 'PA' 'KS' 'OK' 'FL']


In [5]:
# Save the DataFrame to a CSV file
df.to_csv('random_radio_raw_transcripts.csv', index=False)

print("Dataset has been downloaded as 'random_radio_raw_transcripts.csv'")

Dataset has been downloaded as 'random_radio_raw_transcripts.csv'
