In [1]:
import pandas as pd
import sys

def _print_numeric_group_stats(col, series):
    """Print count, mean, min, max and sample std of one numeric column in a group."""
    std = series.std()
    print(f"  Column: {col} (Numeric)")
    print(f"    Count: {int(series.count())}")
    print(f"    Mean: {float(series.mean()):.2f}")
    print(f"    Min: {float(series.min()):.2f}")
    print(f"    Max: {float(series.max()):.2f}")
    # std is NaN when the group has fewer than two non-null observations.
    print("    Std: None" if pd.isna(std) else f"    Std: {float(std):.2f}")


def _print_categorical_group_stats(col, series):
    """Print row count, distinct count and most frequent value of one column in a group."""
    counts = series.value_counts()
    if counts.empty:
        # Every value in this group is missing: there is no mode to report.
        top_value, top_count = None, 0
    else:
        top_value, top_count = series.mode().iloc[0], counts.max()
    print(f"  Column: {col} (Categorical)")
    print(f"    Count: {len(series)}")
    print(f"    Unique Values: {series.nunique()}")
    print(f"    Most Frequent: {top_value} (Count: {top_count})")


def analyze_dataset(file_path, grouping_columns=("page_id", "ad_id"), max_columns=10, max_groups=3):
    """Print per-column and per-group summary statistics for a CSV file.

    The report is written to stdout in two sections: statistics for each of
    the first ``max_columns`` columns, then statistics for the first
    ``max_groups`` groups formed by ``grouping_columns`` (in sorted key
    order, rows with a missing key excluded).

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        grouping_columns: Column name, or sequence of any number of column
            names, to group by. They must be among the first ``max_columns``
            columns of the file.
        max_columns: Number of leading columns to analyze.
        max_groups: Number of groups to print in the aggregated section.

    Returns:
        None. Any exception is reported as ``Error: ...`` on stderr instead
        of being propagated.
    """
    try:
        # Load dataset (universal for any CSV)
        df = pd.read_csv(file_path)

        # Limit the number of columns for manageability
        columns = df.columns[:max_columns]
        df_limited = df[columns]

        # Accept a single column name as well as a sequence of names.
        if isinstance(grouping_columns, str):
            group_keys = [grouping_columns]
        else:
            group_keys = list(grouping_columns)

        # Per-column statistics
        print("=== Per-Column Statistics ===")
        print(df_limited.describe())  # Count, mean, std, min, max for numeric
        for col in df_limited.select_dtypes(include=['object']):
            counts = df_limited[col].value_counts()
            if counts.empty:
                # Column holds only missing values; idxmax() would raise.
                most_frequent, most_frequent_count = None, 0
            else:
                most_frequent, most_frequent_count = counts.idxmax(), counts.max()
            print(f"\n{col}:")
            print(f"  Most Frequent: {most_frequent} (Count: {most_frequent_count})")
            print(f"  Count: {len(df_limited[col])}")
            print(f"  Unique Values: {df_limited[col].nunique()}")

        # Aggregation by specified columns
        print(f"\n=== Aggregated by {', '.join(group_keys)} ===")
        # Grouping columns are always reported as categorical, even if numeric.
        numeric_cols = set(df_limited.select_dtypes(include="number").columns) - set(group_keys)
        # Iterate the groups lazily so that only the first ``max_groups`` are
        # summarized, rather than describing every group and discarding most.
        for i, (key, group_df) in enumerate(df_limited.groupby(group_keys)):
            if i >= max_groups:
                break
            if not isinstance(key, tuple):
                # Some pandas versions yield a scalar key for a single grouper.
                key = (key,)
            print(f"\nGroup: {', '.join(f'{col}={val}' for col, val in zip(group_keys, key))}")
            for col in columns:
                if col in numeric_cols:
                    _print_numeric_group_stats(col, group_df[col])
                else:
                    _print_categorical_group_stats(col, group_df[col])
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)

if __name__ == "__main__":
    # Script entry point: analyze the bundled CSV using the function's
    # default grouping columns and limits.
    dataset_path = "2024_fb_ads_president_scored_anon.csv"
    analyze_dataset(file_path=dataset_path)

=== Per-Column Statistics ===
       estimated_audience_size  estimated_impressions  estimated_spend
count             2.467450e+05          246745.000000    246745.000000
mean              5.564629e+05           45601.525952      1061.291434
std               4.098648e+05          136790.769901      4992.560749
min               0.000000e+00             499.000000        49.000000
25%               7.500000e+04             499.000000        49.000000
50%               3.000000e+05            3499.000000        49.000000
75%               1.000001e+06           22499.000000       449.000000
max               1.000001e+06         1000000.000000    474999.000000

page_id:
  Most Frequent: 4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d (Count: 55503)
  Count: 246745
  Unique Values: 4475

ad_id:
  Most Frequent: 0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc (Count: 1)
  Count: 246745
  Unique Values: 246745

ad_creation_time:
  Most Frequent: 2024-10-2