In [None]:
# Quietly install the app and image-processing packages for this notebook.
# NOTE(review): colabcode and pillow are not referenced in any visible cell —
# presumably needed by the Streamlit app script; verify before keeping them.
!pip -q install streamlit colabcode opencv-python pandas numpy pillow

In [None]:
# Quietly install the packages imported below (kagglehub, OpenCV, pandas, NumPy, tqdm).
# opencv-python, pandas, numpy and pillow repeat the install in the previous cell.
!pip -q install kagglehub opencv-python pandas numpy tqdm pillow

import os, glob, kagglehub, pandas as pd, numpy as np, cv2
from tqdm import tqdm

# Download the Kaggle dataset into the local kagglehub cache (~/.cache/kagglehub/...).
# The returned value is the directory that is searched for images below.
path = kagglehub.dataset_download("adinapunyobanerjee/youtube-thumbnail-dataset")
print("Downloaded to:", path)

# Collect every .jpg under the download directory, at any depth.
# glob returns paths in whatever order the filesystem lists them, which is not
# guaranteed to be stable. A later cell only processes `candidates[:5000]`, so
# sort the list to make that slice select the same files on every run.
candidates = sorted(glob.glob(os.path.join(path, "**", "*.jpg"), recursive=True))
print("Total JPGs found:", len(candidates))

# Nothing matched: print the whole downloaded tree so the actual layout
# (folder names, file extensions) can be inspected.
if not candidates:
    for p in sorted(glob.glob(os.path.join(path, "**"), recursive=True)):
        print(p)


In [None]:
def img_stats(fp):
    """Measure four simple quality features of the image stored at ``fp``.

    Parameters
    ----------
    fp : str
        Path handed to ``cv2.imread``.

    Returns
    -------
    tuple or None
        ``(sharpness, contrast, saturation, edge_density)``, or ``None`` when
        ``cv2.imread`` could not load the file.

        * sharpness    - variance of the Laplacian of the grayscale image
        * contrast     - standard deviation of the grayscale image
        * saturation   - mean of channel 1 (S) after BGR -> HSV conversion
        * edge_density - Canny edge pixels divided by the total pixel count
    """
    bgr = cv2.imread(fp)
    if bgr is None:
        # Unreadable or missing file: let the caller decide to skip it.
        return None

    n_rows, n_cols = bgr.shape[:2]
    n_pixels = float(n_rows * n_cols)

    # Sharpness and contrast both work on the grayscale version.
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    sharp = laplacian.var()
    contrast = gray.std()

    # Saturation: average of the S channel in HSV space.
    s_channel = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)[..., 1]
    saturation = s_channel.mean()

    # Canny marks edge pixels with 255, so sum / 255 is the number of edge
    # pixels; dividing by the pixel count gives the fraction that are edges.
    edge_map = cv2.Canny(gray, 100, 200)
    edge_density = edge_map.sum() / 255.0 / n_pixels

    return sharp, contrast, saturation, edge_density

# Names of the raw features, in the order img_stats returns them.
feature_cols = ["sharpness", "contrast", "saturation", "edge_density"]

# Score the first 5k discovered files (widen the slice to process more).
# Files that img_stats could not read come back as None and are skipped.
rows = []
for fp in tqdm(candidates[:5000]):
    stats = img_stats(fp)
    if not stats:
        continue
    rows.append((fp, *stats))

df = pd.DataFrame(rows, columns=["filepath", *feature_cols])

# Rescale every feature to roughly 0-1 using its 1st and 99th percentiles.
# Values are clipped to that range first so outliers do not stretch the scale;
# the small epsilon avoids a division by zero when both percentiles coincide.
for c in feature_cols:
    q1 = df[c].quantile(0.01)
    q99 = df[c].quantile(0.99)
    clipped = df[c].clip(q1, q99)
    df[f"{c}_norm"] = (clipped - q1) / (q99 - q1 + 1e-8)

# aesthetic_proxy = weighted sum of the normalized features (weights add up to 1.0).
proxy_weights = {
    "sharpness": 0.35,
    "contrast": 0.30,
    "saturation": 0.25,
    "edge_density": 0.10,
}
df["aesthetic_proxy"] = sum(
    weight * df[f"{feature}_norm"] for feature, weight in proxy_weights.items()
)

# Persist the scored table.
out_csv = "/content/yt_thumbs_aesthetic_proxy.csv"
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

# Cell output: the ten highest-scoring rows.
df.sort_values(by="aesthetic_proxy", ascending=False).head(10)


In [None]:
from IPython.display import display
# Table of the 20 highest-scoring thumbnails: path and score first, followed
# by the raw (un-normalized) feature values.
top20_columns = [
    "filepath",
    "aesthetic_proxy",
    "sharpness",
    "contrast",
    "saturation",
    "edge_density",
]
top20 = df.sort_values(by="aesthetic_proxy", ascending=False).head(20)
display(top20[top20_columns])


In [None]:
from IPython.display import Image, display

# Ten best-scoring thumbnails, best first.
top10 = df.sort_values(by="aesthetic_proxy", ascending=False).head(10)

# Print each thumbnail's score and path, then render the image inline.
for score, thumb_fp in zip(top10["aesthetic_proxy"], top10["filepath"]):
    print(f"Score: {score:.3f} | File: {thumb_fp}")
    display(Image(filename=thumb_fp))


In [None]:
import os
import subprocess
import sys
from getpass import getpass

# pyngrok is only pip-installed by a later cell, so on a fresh kernel run
# top-to-bottom a plain import here raises ModuleNotFoundError. Install the
# same pinned version on demand, into the kernel's own environment.
try:
    from pyngrok import ngrok
except ImportError:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", "pyngrok==7.1.5"]
    )
    from pyngrok import ngrok

# Keep the auth token out of the notebook source: take it from the
# NGROK_AUTHTOKEN environment variable, otherwise prompt for it without echo.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

In [None]:
# 1) Install Streamlit and pyngrok (pyngrok pinned to 7.1.5)
!pip -q install streamlit pyngrok==7.1.5

# 2) Start Streamlit in the background and write its output to a log file.
#    First stop any earlier instance: kill processes matching "streamlit run"
#    and whatever holds TCP port 8501. `|| true` keeps the cell from failing
#    when nothing was running.
#    The app is served headless on port 8501; stdout and stderr both go to
#    /content/streamlit.log.
#    NOTE(review): /content/streamlit_app.py is not created in any visible
#    cell — it must exist before this runs.
#    NOTE(review): CORS and XSRF protection are switched off — presumably so
#    the app works behind the ngrok tunnel; confirm this is acceptable before
#    sharing the public URL.
!pkill -f "streamlit run" || true
!fuser -k 8501/tcp || true
!nohup streamlit run /content/streamlit_app.py \
  --server.port 8501 --server.headless true \
  --server.enableCORS false --server.enableXsrfProtection false \
  > /content/streamlit.log 2>&1 &



In [None]:
# Open an HTTP tunnel to the port Streamlit was started on and print the
# publicly reachable address of that tunnel.
STREAMLIT_PORT = 8501
public_url = ngrok.connect(STREAMLIT_PORT, "http")
print(f"✅ Public URL: {public_url.public_url}")