In [67]:
import pandas as pd

In [68]:
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df = df[df['Origin/Ethnicity'] == 'Bollywood']
df = df[df['Release Year'] >= 2013].sample(10)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
27045,2017,Haraamkhor,Bollywood,Shlok Sharma,Nawazuddin Siddiqui & Shweta Tripathi,comedy/crime,https://en.wikipedia.org/wiki/Haraamkhor,Shyam is a teacher at a small school in a tiny...
26934,2013,Boss,Bollywood,Anthony D'Souza,"Akshay Kumar, Mithun Chakraborty, Danny Denzon...",masala,https://en.wikipedia.org/wiki/Boss_(2013_Hindi...,The film starts with school teacher Satyakant ...
26969,2014,Gang of Ghosts,Bollywood,Satish Kaushik,"Parambrata Chatterjee, Sharman Joshi, Mahie Gi...",horror comedy,https://en.wikipedia.org/wiki/Gang_of_Ghosts,A story about a group of ghosts who have lost ...
27119,2017,Shaadi Mein Zaroor Aana,Bollywood,Ratnaa Sinha,"Rajkummar Rao, Kriti Kharbanda",romance/drama,https://en.wikipedia.org/wiki/Shaadi_Mein_Zaro...,"Satyendra ""Sattu"" Mishra (Rajkummar Rao) and A..."
26939,2013,Krrish 3,Bollywood,Rakesh Roshan,"Hrithik Roshan, Priyanka Chopra, Kangana Ranau...",superhero,https://en.wikipedia.org/wiki/Krrish_3,Scientist Rohit Mehra lives with his son Krish...


In [69]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [70]:
plots = df['Plot'].tolist()
titles = df['Title'].tolist()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(plots)

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(plots)

2025-11-29 14:14:54,327 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
2025-11-29 14:14:56,994 - BERTopic - Embedding - Completed ✓
2025-11-29 14:14:57,002 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-29 14:14:57,293 - BERTopic - Dimensionality - Completed ✓
2025-11-29 14:14:57,295 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-29 14:14:57,311 - BERTopic - Cluster - Completed ✓
2025-11-29 14:14:57,319 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-29 14:14:57,342 - BERTopic - Representation - Completed ✓


In [71]:
# 4. Build the Graph
G = nx.Graph()

movie_info = df[['Title', 'Release Year','Plot']].to_dict('records')

# Add Nodes
for i, title in enumerate(titles):
    current_movie = movie_info[i]
    movie_title = current_movie['Title']
    movie_year = current_movie['Release Year']
    movie_plot = current_movie['Plot']
    
    # 2. Format the Label: Set the string that PyVis will display
    node_label = f"{movie_title}\n({movie_year})"
    node_tooltip = f"{movie_title} ({movie_year})\n{movie_plot}"
    
    # 3. Add the Node with the 'label' attribute
    G.add_node(i, 
               label=node_label,  # <-- THIS IS THE KEY CHANGE
               title=node_tooltip, # title is used for hover text
               topic=topics[i], 
               group=topics[i])

# Add Edges (k-Nearest Neighbors)
sim_matrix = cosine_similarity(embeddings)

for i in range(len(sim_matrix)):
    # Get indices of top 5 similar movies
    top_indices = sim_matrix[i].argsort()[-3:-1] 
    
    for j in top_indices:
        source_id = int(i)
        target_id = int(j)
        weight = float(sim_matrix[i][j])
        
        if weight > 0.5:
            G.add_edge(source_id, target_id, value=weight)

# 5. Export for Visualization
from pyvis.network import Network

# Initialize the PyVis network
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

# Translating the NetworkX graph to PyVis
net.from_nx(G)

net.set_options("""
var options = {
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "interaction": {
    "hover": true,
    "navigationButtons": true
  },
  "manipulation": {
    "enabled": false
  },
  "physics": {
    "enabled": true,
    "barnesHut": {
      "gravitationalConstant": -15000,
      "centralGravity": 0.3,
      "springLength": 100,
      "springConstant": 0.05
    }
  }
}
""")

net.html += """
<script type="text/javascript">
  var network = document.getElementById('mynetwork');
  network.on('selectNode', function (params) {
    if (params.nodes.length === 1) {
      var selectedNodeId = params.nodes[0];
      var nodes = network.getPositions();
      
      // Optional: Fix the selected node's position to prevent movement
      network.body.data.nodes.update({id: selectedNodeId, fixed: true});
      
      // Vis.js automatically handles the default highlighting of neighbors and edges.
      // If you need custom styling (e.g., thicker edges), you would modify 
      // the network.getConnectedEdges(selectedNodeId) here.
    }
  });
  network.on('deselectNode', function (params) {
    if (params.previousSelection.nodes.length === 1) {
      var previouslySelectedNodeId = params.previousSelection.nodes[0];
      // Optional: Unfix the node when deselected
      network.body.data.nodes.update({id: previouslySelectedNodeId, fixed: false});
    }
  });
</script>
"""

# Save the file
net.save_graph("movie_graph.html")

In [72]:
# Create edges ONLY between nodes that share the same Topic ID
# This guarantees clear separation (islands) corresponding to BERTopic's output.

G_topic = nx.Graph()

# Add all nodes first
for i, title in enumerate(titles):
    G_topic.add_node(i, 
                     label=f"{titles[i]}", 
                     topic=topics[i], 
                     group=topics[i])

# Group movies by Topic ID
movies_by_topic = {}
for i, topic_id in enumerate(topics):
    movies_by_topic.setdefault(topic_id, []).append(i)

# Connect movies within the same topic
for topic_id, movie_indices in movies_by_topic.items():
    if topic_id == -1: # Skip outliers
        continue
    
    # Fully connect all movies within the same topic (making a dense island)
    for i in range(len(movie_indices)):
        for j in range(i + 1, len(movie_indices)):
            node_u = movie_indices[i]
            node_v = movie_indices[j]
            # Use a high, consistent weight for visual effect
            G_topic.add_edge(node_u, node_v, value=1.0) 

net.from_nx(G_topic)

net.save_graph("movie_graph.html")


In [73]:
df.query('Title.str.startswith("Iron Man")', engine='python')

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
