# In [None]:
# # ================================================================
# # 🐼 PANDAS MEGA PRACTICE NOTEBOOK (0 → Beginner → Intermediate → Advanced)
# # One-cell, comment/uncomment based learning plan (Colab-friendly)
# # Author: Your Pandas Buddy
# # ================================================================

# # =========================
# # 0) SETUP & DATA LOADING
# # =========================
# # These libraries are usually preinstalled in Colab; install them explicitly to be safe:
# # !pip -q install pandas pyarrow fastparquet openpyxl lxml xlrd sqlalchemy duckdb sqlite-utils

import os, io, json, math, random, textwrap, duckdb, sqlite3, numpy as np, pandas as pd
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt

# Widen pandas' console output: show up to 100 columns and allow 120
# characters per line before wrapping, then report the pandas version in use.
for _name, _setting in {"display.max_columns": 100, "display.width": 120}.items():
    pd.set_option(_name, _setting)
print("✅ pandas:", pd.__version__)


# import pandas as pd

# ==== GitHub RAW URLs ====
# All datasets are fetched from one repository. Spaces and "+" in the file
# names are already percent-encoded (%20, %2B) in the paths below.
base = "https://raw.githubusercontent.com/theabhinaykumar/csv/main/"

user_reviews   = base + "user_reviews.csv"
apps           = base + "apps.csv"
db1_sales      = base + "DB1%20sales.csv"
hospital       = base + "hospital.csv"
ab_nyc_2019    = base + "AB_NYC_2019.csv"
creditcard     = base + "creditcard.csv"
creditcard_1   = base + "creditcard%201.csv"
ny_367k_emails = base + "New%20York%203,67,000%2B%20Email.csv.xlsx"  # Excel file

# ==== Load CSVs ====
# read_csv is given the URL directly, so each call downloads the file.
df_reviews   = pd.read_csv(user_reviews)
df_apps      = pd.read_csv(apps)
df_db1sales  = pd.read_csv(db1_sales)
df_hospital  = pd.read_csv(hospital)
df_abnyc     = pd.read_csv(ab_nyc_2019)
df_credit    = pd.read_csv(creditcard)
df_credit_1  = pd.read_csv(creditcard_1)

# ==== Load Excel ====
# The name contains ".csv" but the file is an .xlsx workbook, hence
# read_excel with the openpyxl engine.
df_ny_emails = pd.read_excel(ny_367k_emails, engine="openpyxl")

# ==== Check data ====
# Print (rows, cols) for every loaded frame, in load order, so a failed or
# truncated download is visible for each dataset (df_db1sales included).
print(df_reviews.shape)
print(df_apps.shape)
print(df_db1sales.shape)
print(df_hospital.shape)
print(df_abnyc.shape)
print(df_credit.shape)
print(df_credit_1.shape)
print(df_ny_emails.shape)

# Peek data
print(df_reviews.head())



# # ---- Choose your data SOURCE ----
# # "github" → public RAW CSV URL
# # "local_upload" → manual upload from your computer
# # "drive" → file path in mounted Google Drive
# SOURCE = "github"   # ← change to: "local_upload" or "drive"

# # Public small dataset (you can replace with your own RAW link)
# GITHUB_RAW_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"  # sample

# # Google Drive path (change if using "drive")
# DRIVE_CSV_PATH = "/content/drive/MyDrive/data/myfile.csv"

# def load_from_github(url: str) -> pd.DataFrame:
#     """
#     GitHub CSV load via pandas over HTTPS
#     """
#     df = pd.read_csv(url)
#     return df

# def load_from_local_upload() -> pd.DataFrame:
#     """
#     Upload CSV from your laptop into Colab.
#     """
#     from google.colab import files
#     up = files.upload()
#     if not up:
#         raise RuntimeError("No file uploaded.")
#     fname = list(up.keys())[0]
#     print("Uploaded:", fname)
#     df = pd.read_csv(fname)  # try default; adjust with sep=',' / encoding if needed
#     return df

# def load_from_drive(path: str) -> pd.DataFrame:
#     """
#     Read CSV from mounted Drive path
#     """
#     if not os.path.isdir("/content/drive"):
#         from google.colab import drive
#         drive.mount("/content/drive", force_remount=True)
#     df = pd.read_csv(path)
#     return df

# # ---- Load main DataFrame: df ----
# if SOURCE == "github":
#     df = load_from_github(GITHUB_RAW_CSV_URL)
# elif SOURCE == "local_upload":
#     df = load_from_local_upload()
# elif SOURCE == "drive":
#     df = load_from_drive(DRIVE_CSV_PATH)
# else:
#     raise ValueError("SOURCE must be 'github', 'local_upload', or 'drive'")

# print("\n=== DATA PREVIEW ===")
# print("Shape:", df.shape)        # (rows, cols)
# print("Columns:", list(df.columns))
# display(df.head())              # top 5
# display(df.sample(min(5, len(df))))  # random sample (up to 5)


# # =========================================================
# # DAY 1 — BASICS: Inspect, Select, Filter, Sort, New Columns
# # =========================================================
# # UNCOMMENT to practice Day-1
# # """
# print("\n================= DAY 1 =================")

# # 1.1 Inspect
# print("\n-- Inspect --")
# print(df.info())
# display(df.describe(include="all"))
# display(df.nunique())

# # 1.2 Column selection
# print("\n-- Column selection --")
# some_cols = list(df.columns)[:3]
# display(df[some_cols].head())

# # 1.3 Row filtering (boolean indexing)
# print("\n-- Filtering (example) --")
# num_col = None
# for c in df.columns:
#     if pd.api.types.is_numeric_dtype(df[c]):
#         num_col = c
#         break
# if num_col:
#     avg_val = df[num_col].mean()
#     display(df[df[num_col] > avg_val].head())

# # 1.4 Sort values
# print("\n-- Sort values --")
# if num_col:
#     display(df.sort_values(by=num_col, ascending=False).head())

# # 1.5 Create/transform columns with assign
# print("\n-- assign() demo --")
# if num_col:
#     df1 = (df
#            .assign(is_big = lambda d: d[num_col] > d[num_col].mean(),
#                    zscore = lambda d: (d[num_col] - d[num_col].mean()) / d[num_col].std()))
#     display(df1.head())

# # 1.6 Rename columns (safe snake_case)
# print("\n-- Rename (snake_case) --")
# def snake(s: str) -> str:
#     return (s.strip().lower()
#               .replace(" ", "_").replace("-", "_")
#               .replace("(", "").replace(")", ""))
# df_ren = df.copy()
# df_ren.columns = [snake(c) for c in df_ren.columns]
# print(df_ren.columns.tolist())
# # """


# # =========================================================
# # DAY 2 — Missing Values, Dtypes, Casting, Duplicates, Memory
# # =========================================================
# # UNCOMMENT to practice Day-2
# # """
# print("\n================= DAY 2 =================")

# # 2.1 Missing values
# print("\n-- Missing Values --")
# display(df_ren.isna().sum())
# df_fill = df_ren.copy()
# for c in df_fill.columns:
#     if pd.api.types.is_numeric_dtype(df_fill[c]):
#         df_fill[c] = df_fill[c].fillna(df_fill[c].median())
#     else:
#         df_fill[c] = df_fill[c].fillna("Unknown")
# display(df_fill.head())

# # 2.2 Dtypes & casting
# print("\n-- Dtypes & Casting --")
# print(df_ren.dtypes)
# # Example: convert object → category for strings with low cardinality
# df_cast = df_ren.copy()
# for c in df_cast.columns:
#     if df_cast[c].dtype == "object" and df_cast[c].nunique() < len(df_cast)*0.5:
#         df_cast[c] = df_cast[c].astype("category")
# print(df_cast.dtypes)

# # 2.3 Remove duplicates
# print("\n-- Duplicates --")
# before = len(df_cast)
# df_cast = df_cast.drop_duplicates()
# after = len(df_cast)
# print(f"Removed {before - after} duplicate rows")

# # 2.4 Memory usage tips
# print("\n-- Memory Usage (MB) --")
# mem_mb = df_cast.memory_usage(deep=True).sum() / 1_048_576
# print(f"Memory: {mem_mb:.3f} MB")

# # 2.5 to_numeric with errors
# print("\n-- to_numeric --")
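# # demo: build a small standalone frame with a messy column
# # (a 5-item list can only be assigned to df_cast if it has exactly 5 rows)
# messy = pd.DataFrame({"messy_num": ["10","20","x","30.5", None]})
# messy["messy_num_num"] = pd.to_numeric(messy["messy_num"], errors="coerce")
# display(messy[["messy_num","messy_num_num"]].head())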
# # """


# # =========================================================
# # DAY 3 — GroupBy, Aggregations, Pivot, Crosstab, Apply
# # =========================================================
# # UNCOMMENT to practice Day-3
# # """
# print("\n================= DAY 3 =================")

# # Pick cat & num col
# cat_col, num_col2 = None, None
# for c in df_ren.columns:
#     if pd.api.types.is_categorical_dtype(df_ren[c]) or df_ren[c].dtype == "object":
#         cat_col = c; break
# for c in df_ren.columns:
#     if pd.api.types.is_numeric_dtype(df_ren[c]):
#         num_col2 = c; break

# # 3.1 Basic groupby
# print("\n-- groupby().agg() --")
# if cat_col and num_col2:
#     gb = df_ren.groupby(cat_col).agg(
#         rows = (num_col2, "size"),
#         mean_val = (num_col2, "mean"),
#         median_val = (num_col2, "median"),
#         max_val = (num_col2, "max"),
#         min_val = (num_col2, "min"),
#     ).reset_index()
#     display(gb.sort_values("rows", ascending=False).head())

# # 3.2 pivot_table
# print("\n-- pivot_table --")
# # Need two cats + one value; make a second cat col if needed
# other_cat = None
# for c in df_ren.columns:
#     if c != cat_col and (pd.api.types.is_categorical_dtype(df_ren[c]) or df_ren[c].dtype == "object"):
#         other_cat = c; break

# if cat_col and other_cat and num_col2:
#     pvt = pd.pivot_table(df_ren, values=num_col2, index=cat_col, columns=other_cat,
#                          aggfunc="mean", fill_value=0)
#     display(pvt)

# # 3.3 Crosstab (frequency table)
# print("\n-- pd.crosstab --")
# if cat_col and other_cat:
#     ct = pd.crosstab(df_ren[cat_col], df_ren[other_cat])
#     display(ct)

# # 3.4 Apply on groups
# print("\n-- groupby().apply --")
# if cat_col and num_col2:
#     def summarize(g):
#         return pd.Series({
#             "rows": len(g),
#             "mean": g[num_col2].mean(),
#             "std": g[num_col2].std()
#         })
#     display(df_ren.groupby(cat_col).apply(summarize).reset_index())
# # """


# # =========================================================
# # DAY 4 — Merge/Join/Concat, Reshape (melt, stack/unstack), MultiIndex
# # =========================================================
# # UNCOMMENT to practice Day-4
# # """
# print("\n================= DAY 4 =================")

# # Synthetic small tables for joins
# customers = pd.DataFrame({
#     "cust_id": [1,2,3,4],
#     "name": ["Amit","Riya","Karan","Neha"],
#     "city": ["Delhi","Noida","Gurgaon","Delhi"]
# })
# orders = pd.DataFrame({
#     "order_id": [101,102,103,104],
#     "cust_id": [1,1,2,5],   # 5 not in customers
#     "amount":  [2999,1599,899,1200]
# })

# print("-- customers --"); display(customers)
# print("-- orders --"); display(orders)

# # 4.1 Joins: inner/left/right/outer
# inner = customers.merge(orders, on="cust_id", how="inner")
# left  = customers.merge(orders, on="cust_id", how="left")
# right = customers.merge(orders, on="cust_id", how="right")
# outer = customers.merge(orders, on="cust_id", how="outer", indicator=True)

# print("\n-- INNER --"); display(inner)
# print("\n-- LEFT --"); display(left)
# print("\n-- RIGHT --"); display(right)
# print("\n-- OUTER --"); display(outer)

# # 4.2 Concat (vertical & horizontal)
# print("\n-- Concat --")
# customers2 = pd.DataFrame({"cust_id":[5,6], "name":["Meera","Vikram"], "city":["Ghaziabad","Faridabad"]})
# vcat = pd.concat([customers, customers2], ignore_index=True)
# hcat = pd.concat([customers.set_index("cust_id"), customers2.set_index("cust_id")], axis=1)
# display(vcat); display(hcat)

# # 4.3 Melt (wide → long)
# print("\n-- Melt --")
# wide = pd.DataFrame({
#     "id":[1,2,3],
#     "Apr":[10,20,30],
#     "May":[15,25,35],
#     "Jun":[18,22,31]
# })
# display(wide)
# long = wide.melt(id_vars="id", var_name="month", value_name="value")
# display(long)

# # 4.4 Stack/Unstack with MultiIndex
# print("\n-- Stack/Unstack --")
# mi = long.set_index(["id","month"]).sort_index()
# unstacked = mi.unstack("month")      # back to wide
# restacked = unstacked.stack("month") # back to long-style
# display(unstacked)
# display(restacked)
# # """


# # =========================================================
# # DAY 5 — Datetime, Time Series, Rolling/Expanding, String Ops, Category
# # =========================================================
# # UNCOMMENT to practice Day-5
# # """
# print("\n================= DAY 5 =================")

# # 5.1 Datetime parsing & features
# print("\n-- Datetime basics --")
# dates = pd.date_range("2025-08-20", periods=10, freq="D")
# ts = pd.DataFrame({"date": dates, "sales": np.random.randint(50, 150, size=10)})
# ts["week"]  = ts["date"].dt.isocalendar().week
# ts["month"] = ts["date"].dt.month
# ts["dow"]   = ts["date"].dt.day_name()
# display(ts.head())

# # 5.2 Resample (needs DatetimeIndex)
# print("\n-- Resample (W) --")
# ts2 = ts.set_index("date").resample("W").sum(numeric_only=True)
# display(ts2)

# # 5.3 Rolling window
# print("\n-- Rolling mean (3-day) --")
# ts["roll3"] = ts["sales"].rolling(3, min_periods=1).mean()
# display(ts)

# # 5.4 Expanding (cumulative)
# print("\n-- Expanding mean --")
# ts["exp_mean"] = ts["sales"].expanding().mean()
# display(ts)

# # 5.5 String ops (str methods)
# print("\n-- String ops --")
# s = pd.Series(["  Delhi  ", "noida", None, "Gurgaon"])
# display(pd.DataFrame({
#     "raw": s,
#     "strip": s.str.strip(),
#     "upper": s.str.upper(),
#     "contains_no": s.str.contains("no", case=False, na=False)
# }))

# # 5.6 Categorical dtype
# print("\n-- Categorical --")
# cats = pd.Series(["High","Low","Medium","High","Low"], dtype="category")
# cats = cats.cat.set_categories(["Low","Medium","High"], ordered=True)
# display(cats)
# # """


# # =========================================================
# # DAY 6 — Apply/Map/Applymap, Vectorization, Eval/Query, Plotting
# # =========================================================
# # UNCOMMENT to practice Day-6
# # """
# print("\n================= DAY 6 =================")

# # 6.1 apply on Series & DataFrame
# print("\n-- apply / map / applymap --")
# demo = pd.DataFrame({"x":[1,2,3,4,5], "y":[10,20,30,40,50]})
# demo["x2"] = demo["x"].map(lambda z: z*z)
# demo["xy"] = demo.apply(lambda r: r["x"]*r["y"], axis=1)
# display(demo)

# # 6.2 Vectorization vs apply
# print("\n-- Vectorization --")
# demo["fast_xy"] = demo["x"]*demo["y"]  # vectorized, faster
# display(demo)

# # 6.3 eval & query
# print("\n-- eval & query --")
# demo = demo.eval("z = x + y")
# display(demo.query("z > 30"))

# # 6.4 Plotting (Matplotlib only, no seaborn)
# print("\n-- Plotting --")
# plt.figure()
# demo.plot(x="x", y="y", kind="line", title="Simple Line")
# plt.show()

# plt.figure()
# demo.plot(x="x", y=["y","z"], kind="bar", title="Bar Demo")
# plt.show()
# # """


# # =========================================================
# # DAY 7 — I/O Everywhere: CSV, Excel, JSON, Parquet, Feather, SQL
# # =========================================================
# # UNCOMMENT to practice Day-7
# # """
# print("\n================= DAY 7 =================")

# OUT = "/content/pandas_outputs"
# os.makedirs(OUT, exist_ok=True)

# # 7.1 CSV
# print("\n-- CSV --")
# df_ren.to_csv(f"{OUT}/data.csv", index=False)
# print("Wrote CSV:", f"{OUT}/data.csv")
# df_back = pd.read_csv(f"{OUT}/data.csv")
# display(df_back.head())

# # 7.2 Excel (openpyxl writer)
# print("\n-- Excel --")
# with pd.ExcelWriter(f"{OUT}/data.xlsx", engine="openpyxl") as w:
#     df_ren.to_excel(w, sheet_name="Sheet1", index=False)
# print("Wrote Excel:", f"{OUT}/data.xlsx")

# # 7.3 JSON (records)
# print("\n-- JSON --")
# df_ren.to_json(f"{OUT}/data.json", orient="records", lines=False)
# print("Wrote JSON:", f"{OUT}/data.json")

# # 7.4 Parquet (pyarrow/fastparquet)
# print("\n-- Parquet --")
# df_ren.to_parquet(f"{OUT}/data.parquet", engine="pyarrow", index=False)
# print("Wrote Parquet:", f"{OUT}/data.parquet")
# df_pq = pd.read_parquet(f"{OUT}/data.parquet")
# display(df_pq.head())

# # 7.5 Feather (fast columnar)
# print("\n-- Feather --")
# df_ren.to_feather(f"{OUT}/data.feather")
# print("Wrote Feather:", f"{OUT}/data.feather")

# # 7.6 SQLite (SQL I/O)
# print("\n-- SQLite --")
# sql_path = f"{OUT}/example.db"
# conn = sqlite3.connect(sql_path)
# df_ren.to_sql("mytable", conn, if_exists="replace", index=False)
# back = pd.read_sql_query("SELECT * FROM mytable LIMIT 5", conn)
# display(back)
# conn.close()
# print("Wrote SQLite DB:", sql_path)
# # """


# # =========================================================
# # ADVANCED — Performance, Chunking, Read Options, Pipe, Styler, Testing
# # =========================================================
# # UNCOMMENT to practice ADVANCED
# # """
# print("\n================= ADVANCED =================")

# # A1) Efficient read_csv with dtypes, parse_dates, usecols
# print("\n-- Efficient read_csv --")
# # Suppose we know columns & dtypes; (here we reuse our CSV)
# dtypes_map = {}
# for c in df_ren.columns:
#     if pd.api.types.is_integer_dtype(df_ren[c]):
#         dtypes_map[c] = "Int64"   # nullable int
#     elif pd.api.types.is_float_dtype(df_ren[c]):
#         dtypes_map[c] = "float32"
#     elif df_ren[c].dtype == "object" and df_ren[c].nunique() < len(df_ren)*0.5:
#         dtypes_map[c] = "category"
#     else:
#         dtypes_map[c] = "object"

# df_fast = pd.read_csv(f"{OUT}/data.csv",
#                       dtype=dtypes_map,
#                       parse_dates=[c for c in df_ren.columns if "date" in c.lower()],
#                       usecols=df_ren.columns[:min(6, len(df_ren.columns))])
# print(df_fast.dtypes)
# display(df_fast.head())

# # A2) Chunked processing (large files)
# print("\n-- Chunked read_csv --")
# tot_rows, sum_first_num = 0, 0.0
# first_num = None
# for c in df_ren.columns:
#     if pd.api.types.is_numeric_dtype(df_ren[c]):
#         first_num = c; break
# if first_num:
#     for chunk in pd.read_csv(f"{OUT}/data.csv", chunksize=2000):
#         tot_rows += len(chunk)
#         if first_num in chunk.columns:
#             sum_first_num += chunk[first_num].fillna(0).sum()
#     print("Total rows (chunked):", tot_rows)
#     print(f"Sum({first_num}) over chunks:", sum_first_num)

# # A3) Pipe pattern for clean transforms
# print("\n-- pipe pattern --")
# def clean_cols(d: pd.DataFrame) -> pd.DataFrame:
#     out = d.copy()
#     out.columns = [snake(c) for c in out.columns]
#     return out

# def add_ratio(d: pd.DataFrame, a: str, b: str, outcol="ratio") -> pd.DataFrame:
#     if a in d.columns and b in d.columns:
#         d[outcol] = d[a].astype(float) / d[b].replace(0, np.nan).astype(float)
#     return d

# piped = (df
#          .pipe(clean_cols)
#          .pipe(lambda d: d.assign(const=1))
#          )
# display(piped.head())

# # A4) Style (for reporting)
# print("\n-- Styler --")
# styled = (piped
#           .head(10)
#           .style
#           .highlight_max(axis=0)
#           .format(precision=2))
# display(styled)

# # A5) Testing small invariants
# print("\n-- Simple assertions --")
# assert len(df) == len(df.drop_duplicates()), "Data has duplicates! (Demo check; ignore if triggered)"
# print("Assertions demo (may raise above if not true) — OK to comment out.")
# # """


# # =========================================================
# # MINI PROJECT — End-to-End (Load → Clean → Transform → Analyze → Export)
# # =========================================================
# # UNCOMMENT to practice MINI PROJECT
# # """
# print("\n================= MINI PROJECT =================")

# # 1) Load already done into df

# # 2) Clean: standardize columns, trim, handle missing
# def clean_basic(d: pd.DataFrame) -> pd.DataFrame:
#     out = d.copy()
#     out.columns = [snake(c) for c in out.columns]
#     for c in out.columns:
#         if out[c].dtype == "object":
#             out[c] = out[c].astype(str).str.strip()
#     # simple impute
#     for c in out.columns:
#         if pd.api.types.is_numeric_dtype(out[c]):
#             out[c] = out[c].fillna(out[c].median())
#         else:
#             out[c] = out[c].replace(["nan","None"], np.nan).fillna("Unknown")
#     return out

# clean = clean_basic(df)
# display(clean.head())

# # 3) Transform: feature engineering demo
# num_cols = [c for c in clean.columns if pd.api.types.is_numeric_dtype(clean[c])]
# str_cols = [c for c in clean.columns if clean[c].dtype == "object"]
# if num_cols:
#     x = num_cols[0]
#     clean["zscore"] = (clean[x] - clean[x].mean()) / clean[x].std(ddof=0)
#     clean["bucket"] = np.where(clean[x] <= clean[x].mean(), "LOW", "HIGH")

# # 4) Analyze
# print("\n-- Null report --")
# display(clean.isna().sum())

# if str_cols and num_cols:
#     ccat, cnum = str_cols[0], num_cols[0]
#     grp = (clean.groupby(ccat, dropna=False)
#                  .agg(rows=(cnum, "size"),
#                       avg=(cnum, "mean"))
#                  .sort_values("rows", ascending=False)
#                  .reset_index())
#     display(grp.head())

# # 5) Export
# PROJECT_OUT = "/content/pandas_project_output"
# os.makedirs(PROJECT_OUT, exist_ok=True)
# clean.to_parquet(f"{PROJECT_OUT}/clean.parquet", index=False)
# grp.to_csv(f"{PROJECT_OUT}/group_summary.csv", index=False)
# print("✅ Project outputs at:", PROJECT_OUT)
# # """


# print("\n🎯 All sections ready. Comment/Uncomment as you learn. Happy Pandas-ing! 🐼")
