In [None]:
import json
from pathlib import Path

# === CONFIGURATION ===
# Location of the notebook to analyze (absolute path; edit for your machine).
NOTEBOOK_PATH = "/Users/b/DATA/PROJECTS/Telco/_T2/Level_3/notebooks/02_DQ_IF.ipynb"

# Upper limits for the complexity report. Lengths are measured in characters
# of cell source; the ratio is markdown cells divided by code cells.
THRESHOLDS = dict(
    max_markdown_cells=100,
    max_code_cells=150,
    max_avg_markdown_length=500,
    max_avg_code_length=700,
    max_consecutive_markdown=5,
    max_md_code_ratio=2.0,
)

# === ANALYSIS SCRIPT ===
def analyze_notebook(path, thresholds=None):
    """Print a complexity report for a notebook file and return the issues.

    Counts markdown and code cells, their average source length in
    characters, the markdown/code ratio, and runs of consecutive markdown
    cells, then compares each figure with its limit.

    Args:
        path: Path of the ``.ipynb`` file (JSON) to analyze.
        thresholds: Optional mapping with the same keys as the module-level
            ``THRESHOLDS``. When omitted, ``THRESHOLDS`` is used.

    Returns:
        list[str]: One message per exceeded limit; empty when nothing is flagged.
    """
    limits = THRESHOLDS if thresholds is None else thresholds

    # Notebook files are UTF-8 JSON; do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        nb = json.load(f)

    code_cells = []
    md_cells = []
    md_streaks = []
    current_md_streak = []

    for i, cell in enumerate(nb.get("cells", [])):
        cell_type = cell.get("cell_type")
        if cell_type == "markdown":
            md_cells.append((i, cell))
            current_md_streak.append(i)
            continue
        if cell_type == "code":
            code_cells.append((i, cell))
        # Any non-markdown cell (code, raw, ...) interrupts a markdown run;
        # otherwise markdown cells separated by a raw cell would be reported
        # as consecutive.
        if current_md_streak:
            md_streaks.append(current_md_streak)
            current_md_streak = []
    if current_md_streak:
        md_streaks.append(current_md_streak)

    # "source" may be a single string or a list of line strings; join handles both.
    md_lengths = [len("".join(cell.get("source", ""))) for _, cell in md_cells]
    code_lengths = [len("".join(cell.get("source", ""))) for _, cell in code_cells]

    md_count = len(md_cells)
    code_count = len(code_cells)
    avg_md_len = sum(md_lengths) / md_count if md_count else 0
    avg_code_len = sum(code_lengths) / code_count if code_count else 0
    # max(..., 1) avoids dividing by zero for notebooks without code cells.
    md_code_ratio = md_count / max(code_count, 1)

    md_limit = limits["max_markdown_cells"]
    code_limit = limits["max_code_cells"]
    md_len_limit = limits["max_avg_markdown_length"]
    code_len_limit = limits["max_avg_code_length"]
    streak_limit = limits["max_consecutive_markdown"]
    ratio_limit = limits["max_md_code_ratio"]

    # === REPORT CHECKS ===
    report = []
    if md_count > md_limit:
        report.append(f"‚ö†Ô∏è Markdown cells: {md_count} exceeds threshold ({md_limit})")

    if code_count > code_limit:
        report.append(f"‚ö†Ô∏è Code cells: {code_count} exceeds threshold ({code_limit})")

    if avg_md_len > md_len_limit:
        # List the individual cells that are themselves over the limit.
        long_md = [i for (i, _), n in zip(md_cells, md_lengths) if n > md_len_limit]
        report.append(f"‚ö†Ô∏è Avg Markdown cell length: {avg_md_len:.1f} exceeds {md_len_limit} (cells: {long_md})")

    if avg_code_len > code_len_limit:
        long_code = [i for (i, _), n in zip(code_cells, code_lengths) if n > code_len_limit]
        report.append(f"‚ö†Ô∏è Avg Code cell length: {avg_code_len:.1f} exceeds {code_len_limit} (cells: {long_code})")

    for streak in md_streaks:
        if len(streak) > streak_limit:
            report.append(f"‚ö†Ô∏è Long markdown streak: {len(streak)} cells ‚Üí indices: {streak}")

    if md_code_ratio > ratio_limit:
        report.append(f"‚ö†Ô∏è High markdown/code ratio: {md_code_ratio:.2f} exceeds {ratio_limit}")

    print("\nüìä Notebook Complexity Report")
    print("="*35)
    print(f"üìÑ Markdown cells: {md_count}")
    print(f"üìü Code cells: {code_count}")
    print(f"‚úèÔ∏è Avg markdown length: {avg_md_len:.1f}")
    print(f"üßÆ Avg code length: {avg_code_len:.1f}")
    print(f"üìä Markdown/code ratio: {md_code_ratio:.2f}")
    print("\nüîç Issues Found:")
    if report:
        for r in report:
            print(r)
    else:
        print("‚úÖ No major complexity issues detected.")

    return report

# === RUN ===
# Analyze the configured notebook only when this code executes as the main
# module; importing it elsewhere defines the function without running it.
if __name__ == "__main__":
    analyze_notebook(NOTEBOOK_PATH)


In [None]:
import re
import sys
from collections import defaultdict, Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import get_ipython

# Performance audit of the live kernel:
#   1) DataFrames held in globals(), by memory footprint
#   2) loop / row-wise patterns in recently executed inputs
#   3) plotting cells containing many numeric literals
#   4) pd.read_csv() calls in recently executed inputs
print("üîç PERFORMANCE AUDIT")
print("=" * 60)

# 1) Aggregate DataFrames by size.
# Iterate over a snapshot: the loop below binds `name`/`obj` at module level,
# which would otherwise modify globals() while it is being iterated.
globals_copy = dict(globals())
df_sizes = []
for name, obj in globals_copy.items():
    if isinstance(obj, pd.DataFrame):
        size_mb = obj.memory_usage(deep=True).sum() / (1024**2)
        df_sizes.append({"name": name, "shape": obj.shape, "size_mb": size_mb})

# Explicit columns keep sort_values()/sum() working when no DataFrame is alive
# (an empty frame built from [] has no "size_mb" column and raises KeyError).
df_sizes_df = pd.DataFrame(df_sizes, columns=["name", "shape", "size_mb"]).sort_values(
    "size_mb", ascending=False
)
print("\nüìä 1. DataFrames by SIZE (MB)")
display(df_sizes_df.head(10))

total_df_mem = df_sizes_df["size_mb"].sum()
print(f"üíæ TOTAL DataFrame memory: {total_df_mem:.1f} MB")

# Session input history, shared by sections 2-4.
try:
    # Raw source text of the inputs executed in this session, oldest first.
    notebook = list(get_ipython().history_manager.input_hist_raw)
except AttributeError:
    # get_ipython() returns None when not running under IPython.
    notebook = []
    print("‚ö†Ô∏è Could not access notebook history")

# Pair every input with its position in the session history so the "cell"
# column means the same thing in all tables below.
# NOTE(review): this position is expected to equal the In[n] prompt number; confirm.
indexed_history = list(enumerate(notebook))

# 2) Heavy loops: range-based for loops, `for i in ...`, .apply( and iterrows.
LOOP_HINT = re.compile(r"for\s+\w+\s+in\s+range\(|for i in|apply\(|iterrows")
heavy_loops = []
for i, cell in indexed_history[-50:]:
    for line_num, line in enumerate(cell.split('\n'), 1):
        # One entry per matching line, so the printed count is not inflated
        # by lines that match several hints.
        if LOOP_HINT.search(line):
            heavy_loops.append({"cell": i, "line": line_num, "code": line.strip()[:80]})

print(f"\nüîÑ 2. Heavy LOOPS/CALCS: {len(heavy_loops)} found")
if heavy_loops:
    display(pd.DataFrame(heavy_loops))

# 3) Plots.
# Crude heuristic: "est_points" is the number of numeric literals in the cell
# text, not the number of points actually drawn.
plot_calls = []
for i, cell in indexed_history[-50:]:
    if any(x in cell.lower() for x in ['plt.plot', 'plt.scatter', 'sns', 'plotly']):
        data_size = len(re.findall(r'\d+', cell))
        if data_size > 750:
            plot_calls.append({"cell": i, "code_snip": cell[:100], "est_points": data_size})

print(f"\nüìà 3. PLOTS >750 points: {len(plot_calls)} found")
if plot_calls:
    display(pd.DataFrame(plot_calls))

# 4) CSV calls per input.
csv_calls = Counter()
for i, cell in indexed_history[-100:]:
    csv_count = len(re.findall(r'pd\.read_csv\s*\(', cell))
    if csv_count > 0:
        csv_calls[i] += csv_count

print(f"\nüìÇ 4. pd.read_csv() calls: {sum(csv_calls.values())} total")
csv_df = pd.DataFrame([{"cell": cell, "count": count} for cell, count in csv_calls.most_common()])
if not csv_df.empty:
    display(csv_df)

# Plot the largest DataFrames; create the figure only when there is data so
# an empty canvas is not left behind.
if not df_sizes_df.empty:
    top_dfs = df_sizes_df.head(10)
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(top_dfs)), top_dfs['size_mb'])
    plt.yticks(range(len(top_dfs)), top_dfs['name'])
    plt.xlabel('Memory (MB)')
    plt.title('Top 10 DataFrames by Memory Usage')
    plt.tight_layout()
    plt.show()

print("\n‚úÖ AUDIT COMPLETE!")

import json
from pathlib import Path
import pandas as pd
from IPython.display import display

# File name of the notebook to inspect; it is resolved against the current
# working directory.
NB_NAME = "02_DQ_IF.ipynb"   # change if your notebook name differs

nb_path = Path(NB_NAME).resolve()
print("Notebook path:", nb_path)

first_lines = []
if not nb_path.exists():
    print("‚ùå Notebook file not found:", nb_path)
else:
    with nb_path.open("r", encoding="utf-8") as f:
        nb = json.load(f)

    # cell_index is the 1-based position among all cells, not just code cells.
    for position, entry in enumerate(nb.get("cells", []), start=1):
        if entry.get("cell_type") == "code":
            source = entry.get("source", [])
            # Source is either one string or a list of line strings.
            if isinstance(source, str):
                parts = source.splitlines()
            else:
                parts = list(map(str, source))
            opening = parts[0].rstrip("\n") if parts else ""
            first_lines.append({"cell_index": position, "first_line": opening})

    df_first_lines = pd.DataFrame(first_lines)
    display(df_first_lines)

import json
from pathlib import Path
from IPython.display import display, Markdown
from IPython.core.getipython import get_ipython
import pandas as pd

# 1) Set this to the current notebook file name (resolved against the
#    current working directory)
NB_NAME = "02_DQ_IF.ipynb"   # <-- change if different
nb_path = Path(NB_NAME).resolve()
print("Notebook path:", nb_path)

if not nb_path.exists():
    raise FileNotFoundError(f"Notebook file not found: {nb_path}")

# 2) Collect first line of every code cell
with nb_path.open("r", encoding="utf-8") as f:
    nb = json.load(f)

first_lines = []
# cell_index is the 1-based position among all cells, not just code cells.
for i, cell in enumerate(nb.get("cells", []), start=1):
    if cell.get("cell_type") != "code":
        continue
    src = cell.get("source", [])
    # Source is either one string or a list of line strings.
    if isinstance(src, str):
        lines = src.splitlines()
    else:
        lines = [str(x) for x in src]
    first = lines[0].rstrip("\n") if lines else ""
    first_lines.append((i, first))

df_first_lines = pd.DataFrame(first_lines, columns=["cell_index", "first_line"])
display(df_first_lines)


# 3) Build markdown list
def _md_code_span(text):
    """Wrap ``text`` in a Markdown inline code span.

    The delimiter is one backtick longer than the longest backtick run in
    ``text`` so backticks in the code cannot close the span early. Nothing is
    backslash-escaped because escapes are shown literally inside code spans.
    """
    if not text:
        # An empty code span is not valid Markdown.
        return "*(empty)*"
    longest = run = 0
    for ch in text:
        run = run + 1 if ch == "`" else 0
        longest = max(longest, run)
    fence = "`" * (longest + 1)
    # Content that starts or ends with a backtick must be padded with a space.
    pad = " " if text.startswith("`") or text.endswith("`") else ""
    return f"{fence}{pad}{text}{pad}{fence}"


md_lines = ["# Code cell first lines", ""]
for idx, line in first_lines:
    md_lines.append(f"- **Cell {idx}**: {_md_code_span(line.replace(chr(10), ' '))}")
md_text = "\n".join(md_lines)

# 4) Inject markdown into the *next* cell
shell = get_ipython()
if shell is None:
    # Not running under IPython: there is no "next cell", so print the text.
    print(md_text)
else:
    shell.set_next_input(md_text, replace=False)

    print("\n‚úÖ A new cell has been pre-filled below.")
    print("üëâ Change its type to *Markdown* (press 'M' in command mode) and run it.")

import nbformat
from nbformat import read
from pathlib import Path
import re
import pandas as pd

# --- config ---
NB_PATH = Path("02_DQ_IF.ipynb")  # notebook to scan; change to your notebook path

# --- heuristics for DataFrame creation patterns ---
# Each regex is anchored at the start of a line and captures the assigned
# variable name in group 1.
DF_PATTERNS = [
    # direct construction / loading through the `pd` alias
    r"^\s*(\w+)\s*=\s*pd\.read_csv\(",
    r"^\s*(\w+)\s*=\s*pd\.read_parquet\(",
    r"^\s*(\w+)\s*=\s*pd\.DataFrame\(",
    # method call on a single plain name
    r"^\s*(\w+)\s*=\s*\w+\.copy\(",
    r"^\s*(\w+)\s*=\s*\w+\.merge\(",
    r"^\s*(\w+)\s*=\s*\w+\.join\(",
    r"^\s*(\w+)\s*=\s*\w+\.groupby\(",
    r"^\s*(\w+)\s*=\s*\w+\.pivot",             # covers pivot and pivot_table
    # right-hand side starts with a df_-prefixed name
    r"^\s*(\w+)\s*=\s*df_[a-zA-Z0-9_]+\b",
]

# Compile once up front; the scan applies every pattern to every source line.
compiled_patterns = list(map(re.compile, DF_PATTERNS))

def find_dataframe_creations(nb_path: Path) -> pd.DataFrame:
    """Scan a notebook's code cells for lines that look like DataFrame assignments.

    Every source line of every code cell is tried against ``compiled_patterns``
    in order; the first pattern that matches produces one row.

    Args:
        nb_path: Notebook file to read (opened as UTF-8, parsed as nbformat v4).

    Returns:
        DataFrame with one row per matching line and the columns
        ``cell_index`` (1-based position among all cells), ``line_no``
        (1-based line within the cell), ``var_name`` (regex group 1) and
        ``code_line`` (the stripped line). Empty when nothing matches.
    """
    with nb_path.open("r", encoding="utf-8") as f:
        nb = read(f, as_version=4)

    def first_hit(text):
        # Lazy scan: stops at the first matching pattern so a line is counted once.
        attempts = (rx.match(text) for rx in compiled_patterns)
        return next((hit for hit in attempts if hit), None)

    rows = []
    for cell_idx, cell in enumerate(nb.get("cells", []), start=1):
        if cell.get("cell_type") == "code":
            src = cell.get("source", "")
            # Source may be a list of lines or a single string.
            lines = src if isinstance(src, list) else src.splitlines()

            for line_no, line in enumerate(lines, start=1):
                hit = first_hit(line)
                if hit is None:
                    continue
                rows.append(
                    dict(
                        cell_index=cell_idx,
                        line_no=line_no,
                        var_name=hit.group(1),
                        code_line=line.strip(),
                    )
                )

    return pd.DataFrame(rows)

# Scan the configured notebook, report how many candidate assignments were
# found, and show the full table. Matches come from regex heuristics over the
# source text, so they are "potential" creations rather than confirmed ones.
df_created = find_dataframe_creations(NB_PATH)
print(f"Found {len(df_created)} potential DataFrame creations")
display(df_created)

# Agg all LIVE DataFrames: one row per global name bound to a DataFrame,
# with its shape and deep memory footprint in MB.
import pandas as pd

engine_dfs = []

# Iterate over a snapshot: this loop binds `name`/`obj` at module level, and
# adding keys to globals() while iterating it raises
# "RuntimeError: dictionary changed size during iteration".
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame):
        engine_dfs.append(
            {
                "name": name,
                "rows": obj.shape[0],
                "cols": obj.shape[1],
                "memory_mb": obj.memory_usage(deep=True).sum() / (1024 ** 2),
            }
        )

# Explicit columns keep the sort and the name filter working when no
# DataFrame is alive (an empty frame built from [] has no columns at all).
df_registry = pd.DataFrame(
    engine_dfs, columns=["name", "rows", "cols", "memory_mb"]
).sort_values("memory_mb", ascending=False)
# A bare expression is only rendered when it is the last line of a cell, so
# show the full registry explicitly.
display(df_registry)

# Hide scratch frames by name.
# NOTE(review): this is a substring match, so it also hides names such as
# "df_latest" (contains "test"); confirm that is acceptable.
engine_registry = df_registry[
    ~df_registry["name"].str.contains("tmp|test|debug", case=False, na=False)
].reset_index(drop=True)

display(engine_registry)


In [None]:
# src_nb = "/Users/b/DATA/PROJECTS/Telco/Level_3/notebooks/02_DQ_IF.ipynb"
# out_nb = "/Users/b/DATA/PROJECTS/Telco/Level_3/outputs/_AGG/02_DQ_IF_AGG.ipynb"

# aggregate_code_cells(src_nb, out_nb)