In [None]:
!pip install -qU langchain-google-genai
!pip install google-generativeai langchain-core langchain
!pip install -qU langchain-google-genai
!pip install "langchain==0.1.14"
!pip install "langchain-core==0.1.42"
!pip install "langchain-community==0.0.29"
!pip install "langchain-google-genai==1.0.5"
!pip install -qU langgraph
!pip install -qU langgraph-checkpoint

In [None]:
import pandas as pd
import numpy as np
import pickle
import json
import re
import os
import google.generativeai as genai
from getpass import getpass
from typing import Optional, List
from langchain.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
from langgraph.graph import StateGraph, MessagesState, START, END
from langgraph.prebuilt import ToolNode
from langgraph.checkpoint.memory import MemorySaver
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
os.chdir("/content/drive/MyDrive/project")

In [None]:
df = pd.read_csv("movies.csv")

df.head()

Unnamed: 0,title,description,release_year,genres_list,rating,votes,directors,actors,runtime_minutes
0,#FriendButMarried 2,Ayudia (Mawar De Jongh) is not satisfied enoug...,2020.0,['Biography'],6.5,120,Rako Prijanto,"['Adipati Dolken', 'Mawar Eva de Jongh', 'Vonn...",100 minutes
1,4 Mantan,"Sara, Airin, Rachel, and Amara were accidental...",2020.0,['Thriller'],6.4,8,Hanny Saputra,"['Ranty Maria', 'Jeff Smith', 'Melanie Berentz...",80 minutes
2,Aku Tahu Kapan Kamu Mati,"After apparent death, Siena is able to see sig...",2020.0,['Horror'],5.4,17,Hadrah Daeng Ratu,"['Natasha Wilona', 'Ria Ricis', 'Al Ghazali', ...",92 minutes
3,Anak Garuda,"Good Morning Indonesia, a school for poor orph...",2020.0,['Adventure'],9.1,27,Faozan Rizal,"['Tissa Biani Azzahra', 'Violla Georgie', 'Aji...",129 minutes
4,Dignitate,Alfi (Al Ghazali) meets Alana (Caitlin Halderm...,2020.0,['Drama'],7.6,33,Fajar Nugros,"['Al Ghazali', 'Caitlin Halderman', 'Giorgino ...",109 minutes


In [None]:
def clean_description(x):
    if isinstance(x, str):
        x = x.lower()
        x = re.sub(r"[^a-zA-Z0-9\s]", " ", x)
        x = re.sub(r"\s+", " ", x).strip()
        return x
    return ""

In [None]:
def clean_actors(x):
    if isinstance(x, str):
        x = x.lower().replace(",", " ")
        x = re.sub(r"\s+", " ", x).strip()
        return x
    return ""

In [None]:
def clean_genres(x):
    if isinstance(x, list):
        x = " ".join(x)
    if isinstance(x, str):
        x = x.lower().replace(",", " ")
        x = re.sub(r"\s+", " ", x).strip()
        return x
    return ""

In [None]:
def clean_directors(x):
    if isinstance(x, str):
        x = x.lower().replace(",", " ")
        x = re.sub(r"\s+", " ", x).strip()
        return x
    return ""

In [None]:
df["soup"] = (
    df["description"].apply(clean_description) + " " +
    df["actors"].apply(clean_actors).str.replace(" ", "_") + " " +
    df["directors"].apply(clean_directors).str.replace(" ", "_") + " " +
    df["genres_list"].apply(clean_genres).str.replace(" ", "_")
)
df["soup"].head()

Unnamed: 0,soup
0,ayudia mawar de jongh is not satisfied enough ...
1,sara airin rachel and amara were accidentally ...
2,after apparent death siena is able to see sign...
3,good morning indonesia a school for poor orpha...
4,alfi al ghazali meets alana caitlin halderman ...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    max_features=50000
)

vectorizer.fit(df["soup"])
tfidf_matrix = vectorizer.transform(df["soup"])


In [None]:
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 359773 stored elements and shape (7432, 50000)>
  Coords	Values
  (0, 744)	0.20084387743839566
  (0, 2834)	0.10378210807000599
  (0, 5783)	0.15315872592676608
  (0, 5788)	0.24570401366704744
  (0, 7048)	0.24570401366704744
  (0, 8077)	0.17567809240045665
  (0, 14265)	0.20948887842178565
  (0, 26208)	0.21508220679188456
  (0, 27862)	0.2263838046013882
  (0, 28743)	0.15673711598489615
  (0, 29823)	0.24570401366704744
  (0, 33248)	0.19576199772622527
  (0, 34433)	0.16174273526698385
  (0, 35882)	0.21508220679188456
  (0, 35883)	0.24570401366704744
  (0, 39635)	0.2263838046013882
  (0, 39646)	0.24570401366704744
  (0, 39672)	0.24570401366704744
  (0, 40779)	0.1942549728867141
  (0, 44252)	0.14065220625854402
  (0, 44437)	0.16673337702597718
  (0, 48832)	0.23146568431355855
  (0, 49227)	0.1091856932308081
  (0, 49252)	0.23146568431355855
  (1, 344)	0.220932411916134
  :	:
  (7430, 32910)	0.2516476778000943
  (7430, 36297)	0.19303

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
pd.DataFrame(cosine_sim).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7422,7423,7424,7425,7426,7427,7428,7429,7430,7431
0,1.0,0.0,0.0,0.0,0.0,0.0,0.194003,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009978,0.0,0.0
1,0.0,1.0,0.012976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002551,0.0,0.025576,0.0,0.0,0.0,0.0,0.0,0.030893
2,0.0,0.012976,1.0,0.0,0.046187,0.039183,0.0,0.007983,0.010797,0.0,...,0.012594,0.016008,0.0,0.009203,0.0,0.0,0.0,0.0,0.02109,0.0
3,0.0,0.0,0.0,1.0,0.008965,0.098597,0.0,0.0,0.0,0.0,...,0.0,0.0,0.005463,0.0,0.007779,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.046187,0.008965,1.0,0.015239,0.0,0.0,0.0,0.001503,...,0.0,0.001085,0.016533,0.0,0.0,0.0,0.0,0.001649,0.0,0.0
5,0.0,0.0,0.039183,0.098597,0.015239,1.0,0.0,0.0,0.0,0.0,...,0.00405,0.0,0.013134,0.0,0.007407,0.004635,0.005244,0.0,0.004439,0.004421
6,0.194003,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.007676,...,0.0,0.0,0.003569,0.0,0.0,0.0,0.0,0.009161,0.0,0.0
7,0.0,0.0,0.007983,0.0,0.0,0.0,0.0,1.0,0.009734,0.0,...,0.0,0.0,0.0,0.008296,0.0,0.0,0.0,0.009711,0.019013,0.0
8,0.0,0.0,0.010797,0.0,0.0,0.0,0.0,0.009734,1.0,0.0,...,0.0,0.0,0.0,0.011221,0.0,0.0,0.0,0.0,0.025717,0.0
9,0.0,0.0,0.0,0.0,0.001503,0.0,0.007676,0.0,0.0,1.0,...,0.0,0.001049,0.0,0.0,0.007094,0.0,0.0,0.001594,0.0,0.0


In [None]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [None]:
def recommend_similar_movie(movie_title, top_n=5):
    if movie_title not in indices:
        return f"Film '{movie_title}' tidak ditemukan dalam data."

    idx = indices[movie_title]
    film = df.iloc[idx]

    title = film["title"]
    description = film["description"]
    release_year = film["release_year"]
    genres = ", ".join(film["genres_list"]) if isinstance(film["genres_list"], list) else film["genres_list"]
    rating = film["rating"]
    votes = film["votes"]
    directors = ", ".join(film["directors"]) if isinstance(film["directors"], list) else film["directors"]
    actors = ", ".join(film["actors"]) if isinstance(film["actors"], list) else film["actors"]
    runtime = film["runtime_minutes"]

    film_info = (
        f"Detail film '{title}':\n"
        f" Deskripsi     : {description}\n"
        f" Tahun rilis   : {release_year}\n"
        f" Genre         : {genres}\n"
        f" Rating        : {rating} ({votes} votes)\n"
        f" Sutradara     : {directors}\n"
        f" Aktor         : {actors}\n"
        f" Durasi        : {runtime} menit\n"
    )

    sim_row = cosine_sim[idx]
    scores = list(enumerate(sim_row))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_results = scores[1 : top_n + 1]

    hasil = []
    for film_index, score in top_results:
        f = df.iloc[film_index]
        f_title = f["title"]
        f_release = f["release_year"]
        f_genres = ", ".join(f["genres_list"]) if isinstance(f["genres_list"], list) else f["genres_list"]
        f_rating = f["rating"]
        f_runtime = f["runtime_minutes"]
        hasil.append(f"{f_title} ({f_release}) | Genre: {f_genres} | Rating: {f_rating} | Durasi: {f_runtime} menit | Skor kemiripan: {round(float(score), 4)}")

    return film_info + f"\nTop {top_n} rekomendasi film yang mirip '{title}':\n- " + "\n- ".join(hasil)

In [None]:
print(recommend_similar_movie("5 cm"))

Detail film '5 cm':
 Deskripsi     : Five best friends try to find out what true friendship is by climbing Mount Semeru, the highest peak in Java.
 Tahun rilis   : 2012.0
 Genre         : ['Adventure']
 Rating        : 7.2 (1,709 votes)
 Sutradara     : Rizal Mantovani
 Aktor         : ['Herjunot Ali', 'Fedi Nuril', 'Pevita Pearce', 'Didi Petet', 'Saykoji', 'Raline Shah', 'Denny Sumargo', nan, nan, nan, nan, nan, nan, nan, nan]
 Durasi        : 125 minutes menit

Top 5 rekomendasi film yang mirip '5 cm':
- Supernova (2014.0) | Genre: ['Drama'] | Rating: 6.4 | Durasi: 135 minutes menit | Skor kemiripan: 0.1824
- Taring (2010.0) | Genre: ['Horror'] | Rating: 5.2 | Durasi: 90 minutes menit | Skor kemiripan: 0.1643
- Sagarmatha (2015.0) | Genre: ['Adventure'] | Rating: 7.2 | Durasi: 98 minutes menit | Skor kemiripan: 0.1616
- Mati Suri (2009.0) | Genre: ['Horror'] | Rating: 5.5 | Durasi: 90 minutes menit | Skor kemiripan: 0.1227
- Doremi & You (2019.0) | Genre: ['Drama'] | Rating: 7.8 | Du

In [None]:
def rekomendasi_tahun(tahun, top_n=10):
    df_filter = df[df["release_year"] == tahun]

    if df_filter.empty:
        return f"Tidak ada film yang rilis pada tahun {tahun}."

    df_sorted = df_filter.sort_values(by="rating", ascending=False).head(top_n)

    hasil = []
    for _, row in df_sorted.iterrows():
        genres = ", ".join(row["genres_list"]) if isinstance(row["genres_list"], list) else row["genres_list"]
        hasil.append(f"{row['title']} | Rating: {row['rating']} | Genre: {genres}")

    return f"\nRekomendasi film tahun {tahun}:\n- " + "\n- ".join(hasil)

In [None]:
print(rekomendasi_tahun(2024))


Rekomendasi film tahun 2024:
- Cora Bora | Rating: 9.9 | Genre: ['Comedy', 'Drama']
- Bad Behaviour | Rating: 9.9 | Genre: ['Comedy', 'Drama', 'Horror', 'Thriller']
- The Grab | Rating: 9.6 | Genre: ['Documentary']
- Poolman | Rating: 9.5 | Genre: ['Comedy', 'Mystery']
- Just the Two of Us | Rating: 9.4 | Genre: ['Drama', 'Thriller']
- Tiger Stripes | Rating: 9.3 | Genre: ['Horror']
- Firebrand | Rating: 9.1 | Genre: ['Drama', 'History']
- Ultraman: Rising | Rating: 9.1 | Genre: ['Animation', 'Action', 'Adventure', 'Comedy', 'Family', 'Fantasy', 'Sci-Fi']
- 20,000 Species of Bees | Rating: 9.0 | Genre: ['Drama']
- Tuesday | Rating: 9.0 | Genre: ['Drama', 'Fantasy']


In [36]:
GENRE_MAP = {
    "komedi": "comedy",
    "aksi": "action",
    "petualangan": "adventure",
    "drama": "drama",
    "horor": "horror",
    "thriller": "thriller",
    "romantis": "romance",
    "romance": "romance"
}

def rekomendasi_genre(genre, top_n=10):
    genre_raw = genre.lower().strip()
    genre_norm = GENRE_MAP.get(genre_raw, genre_raw)

    hasil = []

    for _, row in df.iterrows():
        g_list = str(row["genres_list"]).lower()
        g_list = g_list.replace("[", "").replace("]", "").replace("'", "")
        genre_items = [x.strip() for x in g_list.split(",")]

        if genre_norm in genre_items:
            hasil.append(row)

    if not hasil:
        return f"Tidak ada film dengan genre '{genre}'."

    df_filter = pd.DataFrame(hasil)

    # gunakan 'rating' atau 'rating_num'
    kolom_rating = "rating"
    if "rating_num" in df_filter.columns:
        kolom_rating = "rating_num"

    df_sorted = df_filter.sort_values(by=kolom_rating, ascending=False).head(top_n)

    response = f"\nRekomendasi film genre '{genre}':\n"
    for _, row in df_sorted.iterrows():
        response += f"- {row['title']} | Rating: {row[kolom_rating]} | Tahun: {row['release_year']}\n"

    return response


In [None]:
print(rekomendasi_genre("thriller"))


Rekomendasi film genre 'thriller':
- Bad Behaviour | Rating: 9.9 | Tahun: 2024.0
- Kalamity | Rating: 9.8 | Tahun: 2010.0
- The Moment | Rating: 9.8 | Tahun: 2014.0
- Vengeance | Rating: 9.8 | Tahun: 2022.0
- Inhale | Rating: 9.7 | Tahun: 2010.0
- Civil Brand | Rating: 9.7 | Tahun: 2003.0
- House of the Sleeping Beauties | Rating: 9.7 | Tahun: 2008.0
- The Black Waters of Echo's Pond | Rating: 9.7 | Tahun: 2010.0
- Heartbreak Hospital | Rating: 9.6 | Tahun: 2002.0
- The Unspoken | Rating: 9.6 | Tahun: 2016.0


In [None]:

def get_movies_by_rating(target_rating: float):
    """
    Mengambil semua film dengan rating EXACT (persis) sesuai angka.
    """
    df_local = df.copy()
    df_local["rating_num"] = pd.to_numeric(df_local["rating"], errors="coerce")
    hasil = df_local[df_local["rating_num"] == target_rating]
    return hasil


def get_top_rated_movies(limit: int = 5):
    """
    Film dengan rating tertinggi.
    """
    df_local = df.copy()
    df_local["rating_num"] = pd.to_numeric(df_local["rating"], errors="coerce")
    df_local = df_local.sort_values("rating_num", ascending=False)
    return df_local.head(limit)


def get_low_rated_movies(limit: int = 5):
    """
    Film dengan rating terendah.
    """
    df_local = df.copy()
    df_local["rating_num"] = pd.to_numeric(df_local["rating"], errors="coerce")
    df_local = df_local.sort_values("rating_num", ascending=True)
    return df_local.head(limit)



In [None]:
print( get_movies_by_rating(5))
print( get_top_rated_movies(5))
print( get_low_rated_movies(5))


In [None]:
def rekomendasi_aktor(nama_aktor, top_n=10):
    nama_aktor = nama_aktor.lower().strip().replace(" ", "_")

    df_filter = df[df["actors"].str.replace(" ", "_").str.contains(nama_aktor, case=False, na=False)]

    if df_filter.empty:
        return f"Tidak ada film dengan aktor '{nama_aktor}'"

    df_sorted = df_filter.sort_values(by="rating", ascending=False).head(top_n)

    hasil = []
    for _, row in df_sorted.iterrows():
        hasil.append(
            f"{row['title']} | Rating: {row['rating']} | Tahun: {row['release_year']}"
        )

    return f"\nRekomendasi film dengan aktor '{nama_aktor}':\n- " + "\n- ".join(hasil)

In [None]:
def extract_actor_name(text):
    """
    Mencoba mengekstrak nama aktor dari input user.
    Karena user biasanya menyebut satu nama unik seperti 'pevita', 'raditya dika', dll.
    """
    text = text.lower()

    # daftar aktor dari dataset
    actors_list = []
    for a in df['actors']:
        if isinstance(a, list):
            actors_list.extend([x.lower() for x in a])

    # cari apakah ada nama aktor muncul di kalimat
    for actor in set(actors_list):
        first_name = actor.split()[0]
        if first_name in text:
            return actor

    return None


In [None]:
print(rekomendasi_aktor("Tom Cruise"))


Rekomendasi film dengan aktor 'tom_cruise':
- Edge of Tomorrow | Rating: 8.6 | Tahun: 2014.0
- The Mummy | Rating: 8.5 | Tahun: 2017.0
- The Mummy | Rating: 8.5 | Tahun: 2017.0
- Collateral | Rating: 8.5 | Tahun: 2004.0
- Space Station 3D | Rating: 8.4 | Tahun: 2002.0
- Vanilla Sky | Rating: 8.2 | Tahun: 2001.0
- The Last Samurai | Rating: 8.2 | Tahun: 2003.0
- War of the Worlds | Rating: 8.1 | Tahun: 2005.0
- Mission: Impossible III | Rating: 8.0 | Tahun: 2006.0
- Mission: Impossible II | Rating: 7.9 | Tahun: 2000.0


In [None]:
pickle.dump((tfidf_matrix, cosine_sim, df), open("recom_movie.pkl", "wb"))

In [None]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

Enter your Gemini API key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


## Architecture Update: LangGraph StateGraph

This notebook has been **upgraded** to use the modern **LangGraph StateGraph** architecture instead of the legacy LangChain ReAct agent.

### Key Changes:

#### 1. Model Upgrade
- **OLD:** `gemini-2.0-flash` (Gemini 2.0)
- **NEW:** `gemini-2.5-flash` (Gemini 2.5 Flash)
- Added: `thinking_budget=0` and `include_thoughts=False` for cleaner responses

#### 2. Agent Architecture
- **OLD:** `initialize_agent()` with `AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION`
- **NEW:** LangGraph `StateGraph` with explicit control flow

#### 3. Invoke Pattern
- **OLD:** `agent.invoke({"input": query})`
- **NEW:** `agent.invoke({"messages": [HumanMessage(content=query)]}, config={"configurable": {"thread_id": "..."}})` 

### Benefits of LangGraph StateGraph:

1. **Modern Architecture** - LangGraph is the future of LangChain agents
2. **Built-in Memory** - Conversation memory via `MemorySaver` checkpointer
3. **Explicit Control Flow** - Clear graph with nodes (agent, tools) and edges
4. **Better Debugging** - Each step is explicit and traceable
5. **Thread Management** - Support for multiple conversation threads
6. **Extensibility** - Easy to add new nodes/tools to the graph

### Agent Flow:

```
START ‚Üí Agent Node (LLM with tools)
           ‚Üì
        Tool calls? 
           ‚Üì
        YES ‚Üí Tools Node ‚Üí Agent Node (loop)
           ‚Üì
        NO ‚Üí END
```

### Code Structure:

1. **LLM Initialization**: `ChatGoogleGenerativeAI` with Gemini 2.5 Flash
2. **Tools**: Three tools remain unchanged (`search_movie`, `recommend_movie`, `search_free`)
3. **Agent Building**: `build_agent()` function creates StateGraph workflow
4. **Invocation**: Message-based pattern with thread_id for memory

In [None]:
# 0 ‚Äî LOAD DATA / BUILD TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df["description"].fillna(""))
cosine_sim = cosine_similarity(tfidf_matrix)
pickle.dump((vectorizer, tfidf_matrix, cosine_sim, df), open("recom_movie.pkl", "wb"))
vectorizer, tfidf_matrix, cosine_sim, df = pickle.load(open("recom_movie.pkl", "rb"))


# BUILD INDEX ‚Äî WAJIB
df["title_clean"] = (
    df["title"]
    .astype(str)
    .str.lower()
    .str.replace(r"[^a-z0-9]", "", regex=True)
    .str.strip()
)

indices = pd.Series(df.index, index=df["title_clean"]).drop_duplicates()


# INITIALIZE GEMINI 2.5 FLASH LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
    api_key=os.environ["GOOGLE_API_KEY"],
    thinking_budget=0,
    include_thoughts=False
)

print("LLM aktif: Gemini 2.5 Flash (ChatGoogleGenerativeAI)")


# SYSTEM PROMPT
SYSTEM_PROMPT = """
Kamu adalah chatbot khusus FILM.
Hanya jawab pertanyaan seputar film dalam dataset.
Jika user bertanya di luar film: TOLAK dengan sopan.

Tugasmu:
- Jika user menyebut judul film apapun yang ada di dataset:
   ‚Üí SELALU panggil tool search_movie(judul)
   ‚Üí Tampilkan info lengkap dengan format:
        üé¨ Judul:
        üìñ Deskripsi:
        üé≠ Genre:
        ‚≠ê Rating:
        üé¨ Sutradara:
        üë• Aktor:
        ‚è≥ Durasi:
        üìÖ Tahun:
- Jika user meminta "mirip", "similar", "rekomendasi", "yang seperti ...":
   ‚Üí Setelah memanggil search_movie(judul),
     WAJIB panggil recommend_movie(judul)
   ‚Üí Tampilkan daftar rekomendasi dengan format:
        - Judul (Tahun) | Genre | Rating | Durasi | Skor kemiripan
- Jika user bertanya tentang tahun, aktor, sutradara, genre, rating, durasi, atau kata kunci apapun ‚Üí WAJIB panggil tool search_free.
- Jika user meminta rekomendasi tanpa judul tertentu ‚Üí gunakan data di dataset untuk memberi jawaban terbaik.
- Jika user bertanya tentang film tapi tidak menyebut judul:
   ‚Üí Gunakan search_free.
- Jika user menjawab "boleh", "bolehh", "lanjut", "oke", "iya", "ya", "y", atau hal serupa:
‚Üí Berikan informasi lanjutan berdasarkan konteks sebelumnya.
- Semua jawaban WAJIB dalam bahasa Indonesia.

ATURAN FORMAT WAJIB:
- Semua output FINAL ANSWER HARUS menggunakan format multiline.
- Jangan pernah menggabungkan semua konten dalam satu baris.
- Gunakan line-break \n dan pastikan tetap dipertahankan.
- Format jawaban ketika memberikan info film:
üé¨ Judul:
üìñ Deskripsi:
üé≠ Genre:
‚≠ê Rating:
üé¨ Sutradara:
üë• Aktor:
‚è≥ Durasi:
üìÖ Tahun:

CATATAN:
- Setelah search_movie, kamu HARUS menampilkan hasilnya sebelum jawaban lainnya.
- Untuk pertanyaan apa pun yang mengandung judul film (misalnya "film ini bagus tidak?", "berapa rating 5 cm?", "siapa pemeran di milea", dsb), tetap WAJIB tampilkan info film lengkap terlebih dahulu berdasarkan search_movie.
"""

NON_FILM_KEYWORDS = ["presiden", "politik", "agama", "integral", "anjing", "kucing", "cuaca"]


# TOOL: SEARCH FILM
@tool
def search_movie(title: str):
    """Mencari detail film berdasarkan judul."""
    # Normalisasi judul (lebih toleran)
    clean = re.sub(r"[^a-z0-9]", "", str(title).lower()).strip()

    # Cari exact match
    if clean in indices:
        idx = indices[clean]
    else:
        # Fuzzy fallback (hindari salah total)
        matches = [k for k in indices.index if clean in k]
        if matches:
            idx = indices[matches[0]]
        else:
            return {"error": f"Film '{title}' tidak ditemukan."}

    row = df.iloc[idx]

    return {
        "Detail film": row.get("title"),
        "Deskripsi": row.get("description"),
        "Tahun Rilis": row.get("release_year"),
        "Genre": row.get("genres_list"),
        "Rating": row.get("rating"),
        "Sutradara": row.get("directors"),
        "Aktor": row.get("actors"),
        "Durasi": row.get("runtime_minutes")
    }



# TOOL: REKOMENDASI FILM MIRIP
@tool
def recommend_movie(title: str):
    """Memberi rekomendasi film mirip berdasarkan judul, beserta info lengkapnya."""
    t = re.sub(r"[^a-z0-9]", "", title.lower()).strip()

    if t not in indices:
        return {"error": f"Film '{title}' tidak ditemukan."}

    idx = indices[t]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:8]

    rec = []
    for i, score in sim_scores:
        row = df.iloc[i]

        rec.append({
            "Detail film": row.get("title"),
            "Deskripsi": row.get("description", None),
            "Tahun Rilis": row.get("release_year", None),
            "Deskripsi": row.get("description", None),
            "Genre": row.get("genres_list", None),
            "Rating": row.get("rating", None),
            "Sutradara": row.get("directors", None),
            "Aktor": row.get("actors", None),
            "Durasi": row.get("runtime_minutes", None),
            "similarity": float(score)
        })

    return {"recommendations": rec}


# TOOL: SEARCH BEBAS (ARTIS / SUTRADARA / SINOPSIS / DURASI)
@tool
def search_free(query: str = ""):
    """
    Pencarian bebas berdasarkan teks user:
    - rating tertinggi
    - rating terendah
    - rating tertinggi dari aktor tertentu
    - film berdasarkan aktor/sutradara/genre/tahun
    """
    q = str(query).lower().strip()

    # ==========================
    # 1 ‚Äî RATING TERTINGGI (GLOBAL)
    # ==========================
    if "rating tertinggi" in q or "rating tinggi" in q or "paling bagus" in q:
    # cek apakah user menyebut AKTOR
       aktor = extract_actor_name(q)
       if aktor:
           hasil = get_actor_movies_top(aktor, limit=5)
           if hasil.empty:
               return [{"error": f"Tidak ada film dengan aktor '{aktor}' di dataset."}]
           return hasil.to_dict(orient="records")

        # global top rating
       hasil = get_top_rated_movies(limit=5)
       return hasil.to_dict(orient="records")


    # 2 ‚Äî RATING TERENDAH
    # ==========================
    if "rating terendah" in q or "rating rendah" in q or "paling jelek" in q:
        hasil = get_low_rated_movies(limit=5)
        return hasil.to_dict(orient="records")

    # 3 ‚Äî RATING ANGKA (HANYA JIKA USER JELAS)
    if q.startswith("rating ") or q.startswith("ratingnya ") or "rating " in q:
        angka = re.findall(r"\b(10|[0-9])\b", q)
        if angka:
            angka = float(angka[0])
            subset = get_movies_by_rating(angka)
            if subset.empty:
                return [{"error": f"Tidak ada film dengan rating {angka} di dataset."}]
            return subset.to_dict(orient="records")

    # 4 ‚Äî AKTOR
    if "aktor" in q or "pemeran" in q:
        nama = extract_actor_name(q)
        subset = get_actor_movies_top(nama, limit=5)
        if subset.empty:
            return [{"error": f"Tidak ada film dengan aktor '{nama}' di dataset."}]
        return subset.to_dict(orient="records")

    # 5 ‚Äî SUTRADARA
    if "sutradara" in q or "director" in q:
        name = q.replace("sutradara", "").replace("director", "").strip()
        subset = df[df["directors"].astype(str).str.lower().str.contains(name)]
        if subset.empty:
            return [{"error": f"Tidak ada film dengan sutradara '{name}'."}]
        subset["rating_num"] = pd.to_numeric(subset["rating"], errors="coerce").fillna(0.0)
        subset = subset.sort_values("rating_num", ascending=False).head(5)
        return subset.to_dict(orient="records")

    # 6 ‚Äî GENRE
    genres = ["action", "horror", "drama", "comedy", "thriller", "romance"]
    for g in genres:
        if g in q:
            subset = df[df["genres_list"].astype(str).str.lower().str.contains(g)]
            if subset.empty:
                return [{"error": f"Tidak ada film dengan genre '{g}'."}]
            subset["rating_num"] = pd.to_numeric(subset["rating"], errors="coerce").fillna(0.0)
            subset = subset.sort_values("rating_num", ascending=False).head(5)
            return subset.to_dict(orient="records")

    # 7 ‚Äî TAHUN
    year_match = re.search(r"\b(19|20)\d{2}\b", q)
    if year_match:
        yr = int(year_match.group(0))
        subset = df[df["release_year"].astype(int) == yr]
        if not subset.empty:
            return subset.to_dict(orient="records")

    # 8 ‚Äî JUDUL MENGANDUNG QUERY
    subset = df[df["title"].astype(str).str.lower().str.contains(q)]
    if not subset.empty:
        return subset.to_dict(orient="records")

    return [{"error": "Tidak ada film yang cocok dengan query."}]


# BUILD LANGGRAPH AGENT
def build_agent(tools, llm, system_prompt):
    """Build LangGraph agent workflow using StateGraph"""
    
    class AgentState(dict):
        messages: list
    
    tool_node = ToolNode(tools)
    
    def call_llm(state: AgentState):
        """Call LLM with system prompt and messages"""
        messages = state["messages"]
        
        # Add system prompt if first message
        if len(messages) == 0 or not isinstance(messages[0], SystemMessage):
            messages = [SystemMessage(content=system_prompt)] + messages
        
        # Bind tools to LLM
        llm_with_tools = llm.bind_tools(tools)
        response = llm_with_tools.invoke(messages)
        
        return {"messages": [response]}
    
    def should_continue(state: AgentState):
        """Check if we should continue or end"""
        messages = state["messages"]
        last_message = messages[-1]
        
        # If LLM makes a tool call, continue to tools
        if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
            return "tools"
        # Otherwise end
        return END
    
    # Build graph
    workflow = StateGraph(MessagesState)
    
    # Add nodes
    workflow.add_node("agent", call_llm)
    workflow.add_node("tools", tool_node)
    
    # Set entry point
    workflow.add_edge(START, "agent")
    
    # Add conditional edges
    workflow.add_conditional_edges("agent", should_continue, ["tools", END])
    workflow.add_edge("tools", "agent")
    
    # Compile with memory
    memory = MemorySaver()
    app = workflow.compile(checkpointer=memory)
    
    return app


# CREATE TOOLS LIST AND BUILD AGENT
tools = [search_movie, recommend_movie, search_free]
agent = build_agent(tools, llm, SYSTEM_PROMPT)

print("‚úÖ LangGraph Agent berhasil dibuild!")


# FUNGSI CHAT (CONTEXT AWARE)
last_query = ""

def ask_bot(user_msg):
    global last_query

    if any(x in user_msg.lower() for x in NON_FILM_KEYWORDS):
        return "Maaf, saya hanya bisa menjawab tentang film."

    # Jika user bilang "boleh/lanjut", gunakan context sebelumnya
    if user_msg.lower().strip() in ["boleh", "bolehh", "ya", "iya", "lanjut", "oke", "y", "lanjut"]:
        user_msg = last_query

    last_query = user_msg

    # Invoke agent with LangGraph pattern (messages + thread_id)
    result = agent.invoke(
        {"messages": [HumanMessage(content=user_msg)]},
        config={"configurable": {"thread_id": "test_thread"}}
    )

    # Extract response from last message
    last_message = result["messages"][-1]
    response = last_message.content

    return response


# LOOP UTAMA
print("üé¨ Chatbot Film siap! (Architecture: LangGraph StateGraph)")

while True:
    msg = input("You: ")
    if msg.lower() == "exit":
        break
    print("Bot:", ask_bot(msg))