In [None]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

# Read the season/episode labels from the initial scripts file and attach
# them to the dialogue rows.
# NOTE(review): the columns are matched on the default row index that
# read_csv assigns, so this presumably relies on both CSVs holding the same
# rows in the same order -- confirm against the step that produced them.
scripts = pd.read_csv("data/processed/clean_scripts_initial.csv")
dialogue = pd.read_csv("data/processed/clean_scripts_dialogue.csv").assign(
    Season=scripts["Season"],
    Episode=scripts["Episode"],
)


# Merge every dialogue line of an episode into one space-separated document
# (missing lines are dropped first), giving one row per (Season, Episode).
lines_by_episode = dialogue.groupby(["Season", "Episode"])["dialogue_text"]
episodes = lines_by_episode.apply(
    lambda lines: " ".join(lines.dropna())
).reset_index()


# Build the TF-IDF representation: one row per episode, one column per term.
#
# The default tokenizer splits contractions at the apostrophe ("I'll" ->
# "ll", "don't" -> "don", "I've" -> "ve") and the built-in "english" stop
# list does not cover those fragments, so they end up ranked as top keywords.
# Extend the stop list with the fragments to keep them out of the vocabulary.
# NOTE(review): tokens such as "lt" / "lf" also show up in the results; they
# look like markup residue from the upstream cleaning step rather than
# contractions, so they are deliberately not filtered here -- confirm.
CONTRACTION_FRAGMENTS = {
    "ain", "aren", "couldn", "didn", "doesn", "don", "hadn", "hasn",
    "haven", "isn", "ll", "mightn", "mustn", "needn", "re", "shan",
    "shouldn", "ve", "wasn", "weren", "won", "wouldn",
}
# stop_words must be a list (not a set); sorting keeps it deterministic.
stop_words = sorted(ENGLISH_STOP_WORDS.union(CONTRACTION_FRAGMENTS))

vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=2000)
tfidf_matrix = vectorizer.fit_transform(episodes["dialogue_text"])
feature_names = vectorizer.get_feature_names_out()

# function for top keywords
def get_top_keywords(row, n=10, names=None):
    """Return up to ``n`` terms with the highest weight in ``row``, best first.

    Args:
        row: Object exposing ``toarray()`` (for example one row sliced from
            the sparse TF-IDF matrix). It is flattened to a 1-D score vector.
        n: Maximum number of keywords to return. Values <= 0 yield ``[]``.
        names: Sequence mapping a column index to its term. When omitted,
            the module-level ``feature_names`` is used.

    Returns:
        A list of plain ``str`` terms ordered by descending weight. Terms
        whose weight is zero are left out, so the list may be shorter
        than ``n``.
    """
    if n <= 0:
        # Slicing with ``[-0:]`` would select every column, not none.
        return []

    vocab = feature_names if names is None else names
    scores = row.toarray().flatten()
    ranked = scores.argsort()[::-1][:n]
    # A zero weight means the term does not occur in this row, so it is not
    # a keyword.  ``str(...)`` turns NumPy string scalars into ordinary
    # strings so the resulting list prints and serializes cleanly.
    return [str(vocab[i]) for i in ranked if scores[i] > 0]

# Rank the 10 strongest terms for each episode; row i of the TF-IDF matrix
# corresponds to row i of ``episodes``.
episode_count = tfidf_matrix.shape[0]
keyword_lists = [
    get_top_keywords(tfidf_matrix[row_idx], n=10)
    for row_idx in range(episode_count)
]
episodes["top_keywords"] = keyword_lists


# Keep only the identifying columns plus the keyword lists.
output_columns = ["Season", "Episode", "top_keywords"]
episodes = episodes.loc[:, output_columns]

# Persist the result without the row index; each keyword list is written
# as its string representation.
episodes.to_csv("data/processed/episode_topkeywords.csv", index=False)

# Preview the first rows in the notebook.
episodes.head()


Unnamed: 0,Season,Episode,top_keywords
0,season-01,e1-Winter is Coming,"[king, father, know, ll, don, grace, ned, walk..."
1,season-01,e10-Fire and Blood,"[lt, king, drogo, ll, night, lf, pyre, ve, lik..."
2,season-01,e2-The Kings Road,"[ll, king, butcher, direwolf, lord, don, liar,..."
3,season-01,e3-Lord Snow,"[don, ve, know, lord, like, ll, ned, boy, king..."
4,season-01,e4-Cripples Bastards and Broken Things,"[lord, king, ser, gregor, ll, arryn, ve, don, ..."
