In [1]:
# 🛠️ 1.1  Environment – install external libs (≈3 min)
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q ydata-profiling keybert sentence-transformers rapidfuzz fastapi uvicorn

# 1.2  Imports & helpers
import pandas as pd, numpy as np, re, os, textwrap
import torch
from pathlib import Path
from ydata_profiling import ProfileReport
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from rapidfuzz import process, fuzz
from sklearn.cluster import AgglomerativeClustering
from huggingface_hub import login
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.1/400.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m687.8/687.8 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m91.2 MB/s[0m eta [36m0:0

In [2]:
# 🛠️ 1.3  Upload & load data
from google.colab import files, drive
import zipfile  # stdlib; replaces the shell `unzip` call so re-runs never prompt

ZIP_PATH = Path("TV_sample.zip")
EXTRACT_DIR = Path("tv_data")

# Only open the upload picker when the archive is not already in the working
# directory, so re-running the notebook does not block on a second upload.
uploaded = {} if ZIP_PATH.exists() else files.upload()   # choose TV_sample.zip

# Extract into tv_data/, overwriting files left by an earlier run.
with zipfile.ZipFile(ZIP_PATH) as archive:
    archive.extractall(EXTRACT_DIR)

CSV_PATH = "tv_data/TV_sample.csv"
df = pd.read_csv(CSV_PATH)
print(df.shape)
df.head(3)


Saving TV_sample.zip to TV_sample.zip
(366933, 13)


Unnamed: 0,asset_id,duration,name,season,episode,description,year,actors,director,country,content_type,imdbid,genre
0,35858224,20.0,Blu užuominos,1.0,1.0,"""Blu užuominos"" – tai interaktyvus lavinamasis...",2019.0,"Joshua Dela Cruz, Traci Paige Johnson, Steve B...","Vadim Kapridov, Jeremy Slutskin, M.R. Horhager...",JAV,series,tt9000424,"Adventure, Comedy, Animation"
1,35851511,60.0,Sūrus bučinys,0.0,1.0,Jauni įsimylėjėliai Hermanas ir Julija svajoja...,2022.0,"Anastasiya Ivanova, Maksym Samchyk, Svetlana Z...",Aleksandr Budyonny,,series,,Drama
2,35833932,45.0,Bones,1.0,6.0,"Brilliant, but socially inept, forensic anthro...",2005.0,"Emily Deschanel, David Boreanaz, Michaela Conl...","Maggie Parker, Tawnia McKiernan, Handel Whitmo...",JAV,series,tt0460627,Crime


In [3]:
# 🕵🏻 1.4  Quick profiling (HTML saved to Colab Files)
# Profile a reproducible random sample. The size is capped at len(df) because
# DataFrame.sample raises when asked for more rows than the frame holds.
PROFILE_SAMPLE_SIZE = 10000
profile_sample = df.sample(min(PROFILE_SAMPLE_SIZE, len(df)), random_state=42)
profile = ProfileReport(profile_sample, title="TV Sample – Quick Profile")
profile.to_file("tv_profile.html")
print("📊 Profile ready – download from left pane ➜ Files ➜ tv_profile.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:00<00:05,  2.38it/s][A
100%|██████████| 13/13 [00:01<00:00, 10.39it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

📊 Profile ready – download from left pane ➜ Files ➜ tv_profile.html


In [4]:
# 🧹 2.1  Basic cleaning helpers
def clean_text(s):
    """Normalise one free-text cell.

    Missing values (anything ``pd.isna`` flags) come back as ``np.nan``.
    Any other value is stringified, straight/curly double quotes and
    apostrophes are removed, runs of whitespace are collapsed to a single
    space, and the result is trimmed.
    """
    if pd.isna(s):
        return np.nan
    # Remove quote characters first: trimming before removal left stray
    # spaces behind, e.g. '" hi "' came back as ' hi '.
    s = re.sub(r"[“”\"']", "", str(s))
    return re.sub(r"\s+", " ", s).strip()

# Run clean_text over every free-text column, replacing each column in place.
text_columns = ("name", "description", "actors", "director", "genre", "country")
for col in text_columns:
    df[col] = df[col].map(clean_text)

# Standardise content_type (movie/series/event/other)
def infer_type(row):
    """Return the row's content type, guessing from the title when it is missing.

    A non-null ``content_type`` is returned untouched. Otherwise ``name`` is
    stringified and searched case-insensitively: episode/season markers give
    "series", sports markers give "event", and anything else is "movie".
    """
    existing = row["content_type"]
    if pd.notna(existing):
        return existing

    title = str(row["name"])
    # Checked in order; the first matching pattern decides the label.
    keyword_rules = (
        (r"(s\d+e\d+|episode|season)", "series"),
        (r"( vs |basket|football|euroleague|world cup)", "event"),
    )
    for pattern, label in keyword_rules:
        if re.search(pattern, title, re.I):
            return label
    return "movie"
df["content_type"] = df.apply(infer_type, axis=1)

# Impute year from name "(1999)" pattern
year_rx = re.compile(r"\((19|20)\d{2}\)")

def _year_from_name(name):
    """Return the year written as "(YYYY)" inside `name`, or NaN if there is none.

    Non-string input (e.g. a missing name, which clean_text leaves as NaN)
    yields NaN instead of raising TypeError inside `re`.
    """
    if not isinstance(name, str):
        return np.nan
    match = year_rx.search(name)
    # group(0) is "(YYYY)"; slice off the parentheses before converting.
    return int(match.group(0)[1:-1]) if match else np.nan

# Only rows without a year need the regex; existing years are left untouched.
year_missing = df["year"].isna()
df.loc[year_missing, "year"] = (
    df.loc[year_missing, "name"].map(_year_from_name).astype(float)
)


In [None]:
# ⚡ 📝 3.1  Fast, batched keyword extraction (GPU-aware & HF-authenticated)

# ① Authenticate once (expects HF_TOKEN saved in Colab “Secrets” or env var)
# Log in only when a token is actually available. With token=None, login()
# falls back to an interactive widget, which stalls unattended runs; the
# model loaded below is public, so anonymous access is enough.
hf_token = os.getenv("HF_TOKEN")      # Tools ▸ Secrets ▸ add HF_TOKEN
if hf_token:
    login(token=hf_token)
else:
    print("HF_TOKEN not set – skipping Hugging Face login (public models only).")

# ② Pick device and load models (first pull is ~90 MB ⇒ 1-2 min with token)
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
kw_model    = KeyBERT(model=embed_model)

# ③ Batched extraction to avoid per-row overhead
def extract_keywords_batched(texts, batch_size=512, top_n=5):
    """
    Extract keywords for many documents, one batch at a time.

    texts: list[str]  →  returns list[list[str]] of keywords per doc, with
    exactly one (possibly empty) list per input text, in input order.
    Uses MMR for diversity and processes up to `batch_size` docs at once
    through the module-level `kw_model`.

    Raises ValueError if a batch comes back with a different number of
    entries than documents sent.
    """
    results = []
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts[start:start + batch_size]
        kw_chunk = kw_model.extract_keywords(
            chunk,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=top_n,
            use_mmr=True           # diversify phrases
        )
        # KeyBERT does not always return list[list[(kw, score)]]:
        #   * a batch with no candidate words at all comes back as [];
        #   * a one-document batch comes back un-nested, as list[(kw, score)].
        # Normalise both so every input text keeps its own slot; otherwise the
        # result cannot be assigned back to a DataFrame column.
        if not kw_chunk:
            kw_chunk = [[] for _ in chunk]
        elif isinstance(kw_chunk[0], tuple):
            kw_chunk = [kw_chunk]
        if len(kw_chunk) != len(chunk):
            raise ValueError(
                f"keyword extractor returned {len(kw_chunk)} results "
                f"for {len(chunk)} documents"
            )
        # Keep only the keyword strings, dropping the scores.
        results.extend([[kw for kw, _ in doc] for doc in kw_chunk])
    return results

# ④ Run on the whole column (2-3 × faster on GPU; ~5 × faster than row-loop)
# Missing descriptions are replaced by "" so every entry is a plain string.
description_col = df["description"].fillna("")
descriptions = description_col.tolist()
keyword_lists = extract_keywords_batched(descriptions, top_n=5, batch_size=512)
df["keywords"] = keyword_lists

# The record-linkage cell refers to the embedding model as `model`.
model = embed_model



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  0%|          | 0/717 [00:00<?, ?it/s]

In [None]:
# 🕸️ 4.1  Record linkage – title blocking + SBERT clustering
def normalise_title(t):
    """Return a lower-cased, accent-folded, punctuation-free matching key.

    Non-string input (e.g. NaN for a missing name) gives "". Accents are
    folded (NFKD decomposition with combining marks dropped) so "Sūrus" and
    "Surus" produce the same key. Any run of characters that is not a letter
    or digit, in any script, collapses to a single space.
    """
    import unicodedata  # stdlib; imported locally so this cell stays self-contained

    if not isinstance(t, str):
        return ""
    decomposed = unicodedata.normalize("NFKD", t)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # [\W_] is Unicode-aware for str patterns; the previous ASCII-only class
    # [^a-z0-9] broke non-English titles apart at every accented letter.
    return re.sub(r"[\W_]+", " ", folded.lower()).strip()

BLOCK_PREFIX_LEN = 25        # crude first-N char block
DISTANCE_THRESHOLD = 0.45    # cosine distance above which clusters are not merged

# Missing names become "" so the normaliser always receives a string.
df["title_norm"] = df["name"].fillna("").apply(normalise_title)

# Work on a plain list so every lookup is positional and does not depend on
# the DataFrame's index labels.
titles = df["title_norm"].tolist()

blocks = {}
for pos, title in enumerate(titles):
    blocks.setdefault(title[:BLOCK_PREFIX_LEN], []).append(pos)

cluster_ids = np.full(len(df), -1)
current_cluster = 0

for key, idxs in blocks.items():
    # Rows with no usable title (empty key) say nothing about each other, so
    # each one gets its own cluster, the same as a block with a single row.
    if key == "" or len(idxs) == 1:
        cluster_ids[idxs] = np.arange(current_cluster, current_cluster + len(idxs))
        current_cluster += len(idxs)
        continue

    unique_titles = sorted({titles[i] for i in idxs})
    if len(unique_titles) == 1:
        # Every row carries the identical title: one cluster, nothing to embed.
        cluster_ids[idxs] = current_cluster
        current_cluster += 1
        continue

    # Embed and cluster each distinct title once, then map rows back to it.
    emb = model.encode(unique_titles, show_progress_bar=False)
    # linkage must be given explicitly: scikit-learn's default "ward" linkage
    # only accepts euclidean distances and raises with metric="cosine".
    clust = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=DISTANCE_THRESHOLD,
        metric="cosine",
        linkage="average",
    ).fit(emb)
    label_of = dict(zip(unique_titles, clust.labels_))
    for global_idx in idxs:
        cluster_ids[global_idx] = current_cluster + label_of[titles[global_idx]]
    # Labels are 0..k-1 inside the block; shift the offset past them.
    current_cluster += clust.labels_.max() + 1

df["cluster_id"] = cluster_ids
print("Created", df["cluster_id"].nunique(), "content groupings")


In [None]:
# 📈 5.1  Insights snapshot (can be ported to slide deck)
import matplotlib.pyplot as plt

# Horizontal bar chart of rows per content type, saved next to the notebook.
type_counts = df["content_type"].value_counts()
ax = type_counts.plot(kind="barh", title="Content-type mix (post-clean)")
fig = ax.get_figure()
fig.savefig("content_type_mix.png", bbox_inches="tight")
plt.show()

# Share of missing values per column (2 decimals), most incomplete first.
missing_share = df.isna().mean().round(2)
missing = missing_share.sort_values(ascending=False)
print("Missingness after basic fixes:\n", missing)


In [None]:
# 💾 6.1  Save cleaned & enriched sample
# Write the cleaned, enriched frame to Parquet without the row index.
out_path = "tv_data_cleaned.parquet"
df.to_parquet(path=out_path, index=False)
print("✅ Saved", out_path)
