This dataset documents the development of the audiobook market from 1998 through 2025, including planned releases. It contains information such as authors and release dates, providing a structured overview of key details across this period.
The dataset is designed to capture both foundational aspects and historical trends in audiobooks, with the intention of being expanded and updated with additional details over time.

In [182]:
import re
import numpy as np
import pandas as pd

In [183]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

def update_google_sheet(df, sheet_name, worksheet_name="Working Dataset", creds_file="credentials.json"):
    """
    Replace the contents of a Google Sheets worksheet with a Pandas DataFrame.

    The worksheet is cleared and then rewritten with one header row (the
    column names) followed by one row per DataFrame record.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset you want to upload. NaN and +/-inf values are uploaded
        as empty cells because they cannot be serialized to JSON.
    sheet_name : str
        The name of the Google Sheet (must already exist).
    worksheet_name : str, optional
        The name of the worksheet/tab inside the sheet (default: "Working Dataset").
    creds_file : str, optional
        Path to your Google service account JSON credentials file (default: "credentials.json").

    Returns:
    --------
    None
    """
    # Build the full payload BEFORE touching the sheet, so a problem while
    # converting the DataFrame cannot leave the worksheet cleared but empty.
    safe_df = df.replace([np.nan, np.inf, -np.inf], "")
    rows = [safe_df.columns.values.tolist()] + safe_df.values.tolist()

    # Define scope
    scope = ["https://spreadsheets.google.com/feeds",
             "https://www.googleapis.com/auth/drive"]

    # Load credentials
    creds = ServiceAccountCredentials.from_json_keyfile_name(creds_file, scope)
    client = gspread.authorize(creds)

    # Open the sheet and worksheet
    sheet = client.open(sheet_name).worksheet(worksheet_name)

    # Clear existing content
    sheet.clear()

    # Update with new data
    sheet.update(rows)

    print(f"✅ Google Sheet '{sheet_name}' → '{worksheet_name}' updated successfully!")

In [184]:
# Read the raw export as UTF-8; undecodable bytes are replaced rather than raising
df = pd.read_csv(
    "audible_uncleaned.csv",
    encoding_errors="replace",
    encoding="utf-8",
)

In [185]:
# Ensure UTF-8 consistency and clean whitespace in 'name' column
df["name_normalized"] = (
    df["name"]
    .astype(str)                              # Ensure all values are strings
    .str.encode("utf-8", errors="ignore")     # Drop anything that cannot be encoded as UTF-8
    .str.decode("utf-8")
    .str.strip()                              # Trim leading/trailing whitespace
    .str.replace(r"\s+", " ", regex=True)    # Collapse runs of whitespace into one space
)

# astype(str) turns a missing name into the literal text "nan";
# restore NaN where the original value was missing
df.loc[df["name"].isna(), "name_normalized"] = pd.NA

In [186]:
def clean_and_normalize_authors(text: str) -> str:
    """
    Turn a raw author cell such as "Writtenby:JaneAusten,CharlesDickens"
    into a "; "-separated list of title-cased names ("Jane Austen; Charles Dickens").

    Parameters
    ----------
    text : str
        Raw value of the author column. Missing values are accepted.

    Returns
    -------
    str
        The normalized names joined with "; ", or pd.NA when the input is
        missing or contains no name after cleaning.
    """
    if pd.isna(text):
        return pd.NA

    s = str(text)

    # 1) Remove constant prefix "Writtenby:"
    s = re.sub(r"^\s*Writtenby:\s*", "", s, flags=re.IGNORECASE)

    # 2) Insert spaces on lowercase → Uppercase transitions ("JaneAusten" → "Jane Austen")
    s = re.sub(r"(?<=[a-záéíóöőúüű])(?=[A-ZÁÉÍÓÖŐÚÜŰ])", " ", s)

    # 3) Normalize spacing around delimiters , & / +
    s = re.sub(r"\s*([,&/+])\s*", r" \1 ", s)

    # 4) Normalize 'and' — only as a standalone word. Without the word
    #    boundaries, the letters "and" inside a name (Sanderson, Alexander,
    #    Andy) would be treated as a delimiter and the name torn apart.
    s = re.sub(r"\s*\band\b\s*", " and ", s, flags=re.IGNORECASE)

    # 5) Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    # 6) Split on multiple-author delimiters (again, 'and' only as a whole word)
    parts = re.split(r"\s*(?:,|&|\band\b|/|\+)\s*", s, flags=re.IGNORECASE)

    # 7) Normalize each author name, skipping empty fragments
    authors = [
        re.sub(r"\s+", " ", part.strip()).title()
        for part in parts
        if part.strip()
    ]

    # 8) Return as Google-Sheets-friendly string
    return "; ".join(authors) if authors else pd.NA


# Run the author cleaner over every row of the raw 'author' column
df["author_normalized"] = df["author"].map(clean_and_normalize_authors)

In [187]:
def clean_and_normalize_narrators(text: str) -> str:
    """
    Turn a raw narrator cell such as "Narratedby:BillLobley,ZoeSaldana"
    into a "; "-separated list of title-cased names ("Bill Lobley; Zoe Saldana").

    Parameters
    ----------
    text : str
        Raw value of the narrator column. Missing values are accepted.

    Returns
    -------
    str
        The normalized names joined with "; ", or pd.NA when the input is
        missing or contains no name after cleaning.
    """
    if pd.isna(text):
        return pd.NA

    s = str(text)

    # 1) Remove constant prefix "Narratedby:"
    s = re.sub(r"^\s*Narratedby:\s*", "", s, flags=re.IGNORECASE)

    # 2) Insert spaces on lowercase → Uppercase transitions ("BillLobley" → "Bill Lobley")
    s = re.sub(r"(?<=[a-záéíóöőúüű])(?=[A-ZÁÉÍÓÖŐÚÜŰ])", " ", s)

    # 3) Normalize spacing around delimiters , & / +
    s = re.sub(r"\s*([,&/+])\s*", r" \1 ", s)

    # 4) Normalize 'and' — only as a standalone word. Without the word
    #    boundaries, the letters "and" inside a name (Alexander, Sandra,
    #    Andrew) would be treated as a delimiter and the name torn apart.
    s = re.sub(r"\s*\band\b\s*", " and ", s, flags=re.IGNORECASE)

    # 5) Collapse multiple spaces and trim
    s = re.sub(r"\s+", " ", s).strip()

    # 6) Split on common multi-narrator delimiters (again, 'and' only as a whole word)
    parts = re.split(r"\s*(?:,|&|\band\b|/|\+)\s*", s, flags=re.IGNORECASE)

    # 7) Normalize each narrator name, skipping empty fragments
    narrators = [
        re.sub(r"\s+", " ", part.strip()).title()
        for part in parts
        if part.strip()
    ]

    # 8) Return as a normalized list string (Google Sheets safe)
    return "; ".join(narrators) if narrators else pd.NA


# Run the narrator cleaner over every row of the raw 'narrator' column
df["narrator_normalized"] = df["narrator"].map(clean_and_normalize_narrators)

In [188]:
def parse_time_to_minutes(text):
    """
    Convert a duration string into a total number of minutes.

    Recognized formats (case-insensitive):
      - "10:05"                                  -> hours:minutes
      - "10 hrs and 5 mins", "10h 5m",
        "10 hours 5 minutes", "2 hrs", "45 mins" -> unit-labelled hours and/or minutes
      - "45"                                     -> a bare number, assumed to be minutes

    Parameters
    ----------
    text : str
        Raw duration value. Missing values are accepted.

    Returns
    -------
    int or float
        Total minutes as an int, or np.nan when the value is missing or
        contains no number at all. Returning NaN rather than 0 for
        unparseable text keeps it from being reported as a zero-length title.
    """
    if pd.isna(text):
        return np.nan

    s = str(text).lower().strip()

    # Case 1: HH:MM format
    time_match = re.search(r"(\d+)\s*:\s*(\d+)", s)
    if time_match:
        return int(time_match.group(1)) * 60 + int(time_match.group(2))

    # Case 2: Text-based format with hour and/or minute units
    h_match = re.search(r"(\d+)\s*(hour|hours|hr|hrs|h)", s)
    m_match = re.search(r"(\d+)\s*(minute|minutes|min|mins|m)", s)
    if h_match or m_match:
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    # Case 3: Only a single number (assume minutes)
    num_match = re.search(r"\d+", s)
    if num_match:
        return int(num_match.group())

    # Nothing numeric found: the duration is unknown, not zero
    return np.nan


# --- Convert time → total_minutes ---
df["total_minutes"] = df["time"].apply(parse_time_to_minutes)

# --- VALIDATION ---

# 1) Durations that are zero or below
df["time_invalid_zero_negative"] = df["total_minutes"].le(0)

# 2) Durations outside the plausible window (thresholds are tunable)
MIN_VALID_MINUTES = 5       # anything shorter is likely an error
MAX_VALID_MINUTES = 3000    # 50 hours; longer is very unusual

df["time_invalid_extreme"] = (
    df["total_minutes"].lt(MIN_VALID_MINUTES)
    | df["total_minutes"].gt(MAX_VALID_MINUTES)
)

# 3) Single label per row; the first matching condition wins
df["time_validation_flag"] = np.select(
    [
        df["total_minutes"].isna(),
        df["time_invalid_zero_negative"],
        df["time_invalid_extreme"],
    ],
    ["missing", "zero_or_negative", "extreme_value"],
    default="valid",
)

# --- ✅ DELETE ROWS WITH EXTREME VALUES ONLY ---
keep_rows = df["time_validation_flag"].ne("extreme_value")
df = df.loc[keep_rows].reset_index(drop=True)

In [189]:
# Remember which rows had no language before the values are cast to text
language_missing = df["language"].isna()

# Trim, lowercase, then capitalize so every language reads like "English"
language_text = df["language"].astype(str).str.strip()
df["language_normalized"] = language_text.str.lower().str.capitalize()

# The string cast turned missing values into text; put the NaN markers back
df.loc[language_missing, "language_normalized"] = pd.NA

In [194]:
def parse_stars(value):
    """
    Split a rating description into its numeric parts.

    Examples of accepted text:
      "4.5 out of 5 stars (3 ratings)"
      "4.5 out of 5 stars 3 ratings"
      "Not rated yet"

    Returns a three-element pandas Series: star_rating, max_rating,
    rating_count. Any part that cannot be found is NaN; missing input and
    "Not rated yet" give three NaNs.
    """
    if pd.isna(value):
        return pd.Series([np.nan, np.nan, np.nan])

    text = str(value).strip()

    # Explicit "no rating" marker
    if text.lower() == "not rated yet":
        return pd.Series([np.nan, np.nan, np.nan])

    # "X out of Y" -> score and scale
    score = re.search(r'(\d+(?:\.\d+)?)\s*out of\s*(\d+(?:\.\d+)?)', text, flags=re.IGNORECASE)
    if score:
        star_rating, max_rating = float(score.group(1)), float(score.group(2))
    else:
        star_rating, max_rating = np.nan, np.nan

    # "N rating(s)" -> number of ratings, thousands commas removed
    count = re.search(r'(\d[\d,]*)\s*(rating|ratings)', text, flags=re.IGNORECASE)
    rating_count = int(count.group(1).replace(",", "")) if count else np.nan

    return pd.Series([star_rating, max_rating, rating_count])


# --- Split the raw 'stars' text into three numeric columns ---

df[["star_rating", "max_rating", "rating_count"]] = df["stars"].apply(parse_stars)


# --- VALIDATION ---

# 1) star_rating must lie in [0, max_rating]; rows missing either value pass by default
mask_valid_bounds = df["star_rating"].notna() & df["max_rating"].notna()

df["star_rating_valid"] = ~mask_valid_bounds | (
    df["star_rating"].ge(0) & df["star_rating"].le(df["max_rating"])
)

# 2) rating_count must not be negative (it was parsed as an integer; NaN is allowed)
df["rating_count_valid"] = df["rating_count"].isna() | df["rating_count"].ge(0)

In [191]:
# --- 1) Clean and parse price column (with "Free" support) ---

def parse_price(value):
    """
    Clean a price string and convert it to a float.

    Handles:
      - "$9.99", "€ 1,234.50", "9 999 Ft"
      - "1.234,50" (dot thousands, comma decimal)
      - "1,234,567" / "1.234.567" (repeated separator = thousands grouping)
      - "9,99" (single comma = decimal comma)
      - "Free" -> 0.0

    Parameters
    ----------
    value : str
        Raw price value. Missing values are accepted.

    Returns
    -------
    float
        The numeric price, or np.nan when the value is missing or cannot be
        interpreted as a number.
    """
    if pd.isna(value):
        return np.nan

    s = str(value).strip().lower()

    # Handle "Free"
    if s == "free":
        return 0.0

    # Remove currency symbols and any non-digit/.,- characters
    s = re.sub(r"[^0-9\.,-]", "", s)

    if s == "":
        return np.nan

    has_comma = "," in s
    has_dot = "." in s

    if has_comma and has_dot:
        # Whichever separator appears last is the decimal mark; the other
        # one groups thousands ("1,234.50" vs "1.234,50").
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif has_comma:
        if s.count(",") > 1:
            # A number has at most one decimal mark, so repeated commas
            # can only be thousands separators.
            s = s.replace(",", "")
        else:
            # Single comma and no dot -> assume decimal comma
            s = s.replace(",", ".")
    elif s.count(".") > 1:
        # Repeated dots can only be thousands separators
        s = s.replace(".", "")

    try:
        return float(s)
    except ValueError:
        return np.nan

# --- Numeric price for every row ---
df["price_numeric"] = df["price"].apply(parse_price)
price_known = df["price_numeric"].notna()

# --- 2) Mark items whose raw price text is "Free" ---
raw_price_text = df["price"].astype(str).str.strip().str.lower()
df["price_is_free"] = raw_price_text.eq("free")

# --- 3) Whole-number vs fractional prices ---

df["price_is_integer"] = price_known & df["price_numeric"].mod(1).abs().lt(1e-9)

any_non_integer = (price_known & ~df["price_is_integer"]).any()
print("Any non-integer prices?", any_non_integer)

# --- 4) Uniform two-decimal text form (blank when the price is unknown) ---

df["price_display"] = df["price_numeric"].map(
    lambda price: "" if pd.isna(price) else f"{price:.2f}"
)

# --- 5) Extreme outliers via the 1.5 * IQR fences ---

valid_prices = df["price_numeric"].dropna()

if valid_prices.empty:
    df["price_outlier"] = False
else:
    Q1 = valid_prices.quantile(0.25)
    Q3 = valid_prices.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df["price_outlier"] = price_known & (
        df["price_numeric"].lt(lower_bound) | df["price_numeric"].gt(upper_bound)
    )

outlier_total = int(df["price_outlier"].sum())

print(outlier_total)

print("Count of price outliers: " + str(outlier_total))

# --- 6) Prices below zero ---

df["price_negative"] = price_known & df["price_numeric"].lt(0)

Any non-integer prices? True
718
Count of price outliers: 718


In [192]:
import numpy as np

# Raw price plus every price-derived column, in upload order
cols = [
    "price",
    "price_numeric",
    "price_is_free",
    "price_is_integer",
    "price_display",
    "price_outlier",
    "price_negative",
]

# Independent copy of those columns with NaN / inf blanked out
# (empty strings are JSON-safe and Google Sheets-safe)
df_gsheet = df.loc[:, cols].copy().replace([np.nan, np.inf, -np.inf], "")

# Push the sanitized frame to the sheet
update_google_sheet(df_gsheet, sheet_name="Audable Web Scraped")


✅ Google Sheet 'Audable Web Scraped' → 'Working Dataset' updated successfully!


In [200]:
# Source column -> final column name, in output order
column_labels = {
    "name_normalized": "Name",
    "author_normalized": "Author",
    "narrator_normalized": "Narrator",
    "total_minutes": "TotalMinutes",
    "language_normalized": "Language",
    "star_rating": "StarRating",
    "max_rating": "MaxRating",
    "rating_count": "RatingCount",
    "price_numeric": "Price",
}

# Select, rename, then blank out NaN / inf (JSON-safe, Google Sheets-safe)
df_cleaned = (
    df[list(column_labels)]
    .rename(columns=column_labels)
    .replace([np.nan, np.inf, -np.inf], "")
)

In [201]:
# Upload the cleaned dataset; the target worksheet is cleared and rewritten
update_google_sheet(
    df_cleaned,
    sheet_name="Audable Web Scraped",
)

✅ Google Sheet 'Audable Web Scraped' → 'Working Dataset' updated successfully!


In [206]:
# Save the cleaned dataset to disk without the index column
df_cleaned.to_csv(
    "audible_cleaned.csv",
    index=False,
    encoding="utf-8-sig",
)