In [None]:
import gzip
import json
import pandas as pd
import wandb

In [None]:
# Open the Weights & Biases run that the EDA cells below log their figures to.
run = wandb.init(
    project="book-recommendation",
    group="dev",
    job_type="eda",
    save_code=True,
)

In [None]:
def sample_json_file(filepath: str, sample_size: int = 100_000) -> pd.DataFrame:
    """Sample a JSON lines file and return a DataFrame."""
    import random

    sampled_lines = []
    total_lines = 0

    print(f"Sampling {sample_size} lines from {filepath}...")

    print("Estimating total number of lines in the file...")
    with gzip.open(filepath, "rt") as f:
        for i, _ in enumerate(f):
            total_lines += 1
            if i > 1_000_000:  # Limit to first million lines for speed
                break

    # Calculate sampling probability
    if total_lines > sample_size:
        sample_prob = sample_size / total_lines
    else:
        sample_prob = 1.0

    print(
        f"Total lines estimated: {total_lines}. Sampling probability: {sample_prob:.6f}"
    )

    # Collect sampled
    with gzip.open(filepath, "rt") as f:
        for line in f:
            try:
                if random.random() < sample_prob:
                    sampled_lines.append(json.loads(line))
                    if len(sampled_lines) >= sample_size:
                        break
            except json.JSONDecodeError:
                continue

    print(f"Sampled {len(sampled_lines)} lines.")
    return pd.DataFrame(sampled_lines)

In [None]:
# Sample the interactions dump with the default sample size.
interactions_records = sample_json_file(
    filepath="../data/goodreads_interactions_dedup.json.gz"
)

In [None]:
# NOTE(review): sample_json_file already returns a DataFrame, so this re-wrap
# looks redundant.
interactions_df = pd.DataFrame(data=interactions_records)

In [None]:
# Preview the first rows of the interactions sample.
interactions_df.head(n=5)

In [None]:
# Sample the books metadata dump with the default sample size.
books_records = sample_json_file(filepath="../data/goodreads_books.json.gz")

In [None]:
# NOTE(review): sample_json_file already returns a DataFrame, so this re-wrap
# looks redundant.
books_df = pd.DataFrame(data=books_records)

In [None]:
# Preview the first rows of the books sample.
books_df.head(n=5)

In [None]:
# Sample the book-genres dump, capped at 10,000 records.
genres_records = sample_json_file(
    filepath="../data/goodreads_book_genres_initial.json.gz",
    sample_size=10_000,
)

In [None]:
# NOTE(review): sample_json_file already returns a DataFrame, so this re-wrap
# looks redundant.
genres_df = pd.DataFrame(data=genres_records)

In [None]:
# Preview the first rows of the genres sample.
genres_df.head(n=5)

In [None]:
# Sample the book-authors dump, capped at 10,000 records.
author_records = sample_json_file(
    filepath="../data/goodreads_book_authors.json.gz",
    sample_size=10_000,
)

In [None]:
# NOTE(review): sample_json_file already returns a DataFrame, so this re-wrap
# looks redundant.
authors_df = pd.DataFrame(data=author_records)

In [None]:
# Preview the first rows of the authors sample.
authors_df.head(n=5)

In [None]:
def run_eda_on_sample(
    df: pd.DataFrame,
    numerical_cols: list[str] | None = None,
    categorical_cols: list[str] | None = None,
) -> None:
    """Run comprehensive EDA on a DataFrame."""
    import plotly.express as px
    import plotly.graph_objects as go

    print("=== EXPLORATORY DATA ANALYSIS ===")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Basic information
    print("\n--- DataFrame Info ---")
    print(df.info())

    # Missing values
    print("\n--- Missing Values ---")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame(
        {"Missing Count": missing, "Missing Percentage": missing_pct}
    )
    print(missing_df[missing_df["Missing Count"] > 0])

    # Unique values - handle unhashable types properly
    print("\n--- Unique Values ---")
    unique_counts = {}
    for col in df.columns:
        try:
            unique_counts[col] = df[col].nunique()
        except TypeError:
            # Handle unhashable types by converting to string first
            unique_counts[col] = (
                df[col]
                .apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
                .nunique()
            )

    print("Unique values per column:")
    for col, count in unique_counts.items():
        print(f"{col}: {count}")

    # Descriptive statistics - exclude columns with complex data structures
    print("\n--- Descriptive Statistics ---")
    simple_cols = []
    for col in df.columns:
        # Check if column contains lists or dicts
        if not df[col].apply(lambda x: isinstance(x, (list, dict))).any():
            simple_cols.append(col)

    if simple_cols:
        print(df[simple_cols].describe(include="all").T)
    else:
        print("No simple columns found for describe()")

    # Visualizations with Plotly
    numeric_cols = (
        df.select_dtypes(include=["number"]).columns.to_list()
        if numerical_cols is None
        else numerical_cols
    )
    categorical_cols = (
        df.select_dtypes(include=["object", "category"]).columns.to_list()
        if categorical_cols is None
        else categorical_cols
    )

    for col in numeric_cols:
        # Ensure the column is truly numeric
        plot_data = pd.to_numeric(df[col], errors="coerce").dropna()
        if len(plot_data) > 0:
            temp_df = pd.DataFrame({col: plot_data})
            fig = px.histogram(
                temp_df,
                x=col,
                nbins=20,
                title=f"Distribution of {col}",
                marginal="box",
                hover_data=[col],
            )
            fig.show()

            # Log to wandb
            run.log({f"histogram_{col}": fig})
        else:
            print(f"Skipping {col} - no valid numeric data")

    for col in categorical_cols:
        # Convert unhashable types to strings and filter out complex objects
        clean_data = df[col].apply(
            lambda x: str(x) if isinstance(x, (list, dict)) else x
        )
        clean_data = clean_data[clean_data.notna()]  # Remove NaN values

        if len(clean_data) > 0:
            top_categories = clean_data.value_counts().head(20)
            if len(top_categories) > 0:
                fig = px.bar(
                    x=top_categories.values,
                    y=top_categories.index,
                    orientation="h",
                    title=f"Top 20 Categories in {col}",
                    labels={"x": "Count", "y": col},
                )
                fig.update_layout(height=600)
                fig.show()

                # Log to wandb
                run.log({f"bar_chart_{col}": fig})
            else:
                print(f"Skipping {col} - no categories to plot")
        else:
            print(f"Skipping {col} - no data to plot")

In [None]:
# Interactions: numeric columns are auto-detected; the two listed columns are
# charted as categories.
run_eda_on_sample(
    df=interactions_df,
    categorical_cols=["is_read", "rating"],
)

In [None]:
# Books: explicit numeric and categorical column selections.
run_eda_on_sample(
    df=books_df,
    numerical_cols=["text_reviews_count", "average_rating", "num_pages", "ratings_count"],
    categorical_cols=["country_code", "language_code", "is_ebook", "format"],
)

In [None]:
# Genres: printed summary only; empty lists disable both chart loops.
run_eda_on_sample(
    df=genres_df,
    numerical_cols=[],
    categorical_cols=[],
)

In [None]:
# Authors: histograms for the three numeric columns, no bar charts.
run_eda_on_sample(
    df=authors_df,
    numerical_cols=["average_rating", "text_reviews_count", "ratings_count"],
    categorical_cols=[],
)

In [None]:
# Finish the wandb run that was created by wandb.init() in the first code cell.
run.finish()