In [2]:
import pandas as pd

In [None]:
# ---------- Data Loading ----------

def load_dataset_from_gdrive(file_id: str) -> pd.DataFrame:
    """Download a public CSV file from Google Drive **without** requiring `gdown`.

    Parameters
    ----------
    file_id : str
        The _alphanumeric_ part that appears after `id=` in a Google‑Drive share URL.

    Returns
    -------
    pd.DataFrame
        The CSV loaded into a pandas DataFrame.
    """
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    return pd.read_csv(url)

In [4]:
# ---------- Dataset Profiling ----------

def profile_dataset(df: pd.DataFrame, sample_size: int = 1):
    """Light‑weight profiler that infers column types and basic stats.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to profile.
    sample_size : int, optional
        *Reserved for future use* – how many example values to keep per column.

    Returns
    -------
    dict
        A JSON‑serialisable summary containing column names, inferred logical
        types, and simple quality metrics (missing‑rate, #unique, example value).
    """
    type_map = {}
    stats = {}

    for col in df.columns:
        dtype = df[col].dtype
        n_missing = df[col].isna().sum()
        total = len(df)
        n_unique = df[col].nunique()
        example = (
            df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        )

        # ---- heuristic type inference -------------------------------------
        if pd.api.types.is_numeric_dtype(dtype):
            inferred = "numeric"
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            inferred = "datetime"
        elif pd.api.types.is_object_dtype(dtype):
            if n_unique / total < 0.3 and n_unique < 100:
                inferred = "categorical"
            else:
                inferred = "text"
        else:
            inferred = "other"

        type_map[col] = inferred
        stats[col] = {
            "missing_rate": round(n_missing / total, 4),
            "n_unique": int(n_unique),
            "example": example,
        }

    return {"columns": list(df.columns), "types": type_map, "stats": stats}

In [6]:
# ---------- Quick demo -----------------------------------------------------

if __name__ == "__main__":
    # Google‑Drive file ID for the dataset the user provided
    # https://drive.google.com/file/d/1iggV9et6qdhJyxTcEg32mVBeJf3D52LZ/view?usp=sharing
    
    gdrive_file_id = "https://drive.google.com/uc?export=download&id=1iggV9et6qdhJyxTcEg32mVBeJf3D52LZ"

    # 1️⃣  Load the CSV directly from Drive
    df = pd.read_csv(gdrive_file_id)

    # 2️⃣  Profile the dataset
    profile = profile_dataset(df)

    # 3️⃣  Pretty‑print a compact summary
    import json, textwrap

    print("\nDataset profile summary:\n")
    print(textwrap.indent(json.dumps(profile, indent=2, default=str), "  "))


Dataset profile summary:

  {
    "columns": [
      "Quarter",
      "number_Customers",
      "Total_Transactions",
      "Revenue",
      "Profit"
    ],
    "types": {
      "Quarter": "text",
      "number_Customers": "text",
      "Total_Transactions": "text",
      "Revenue": "text",
      "Profit": "text"
    },
    "stats": {
      "Quarter": {
        "missing_rate": 0.0,
        "n_unique": 16,
        "example": "2016Q1"
      },
      "number_Customers": {
        "missing_rate": 0.0,
        "n_unique": 16,
        "example": "2,943"
      },
      "Total_Transactions": {
        "missing_rate": 0.0,
        "n_unique": 16,
        "example": "3,683"
      },
      "Revenue": {
        "missing_rate": 0.0,
        "n_unique": 16,
        "example": "396,436"
      },
      "Profit": {
        "missing_rate": 0.0,
        "n_unique": 16,
        "example": "189,057"
      }
    }
  }
