In [None]:
import pandas as pd # pandas helps with data manipulation
import numpy as np # numpy helps does random number stuff
import kagglehub # for downloading datasets from kaggle
import os # for file path stuff

In [None]:
# Fetch the newest published version of the Letterboxd dataset through
# kagglehub and keep the directory it returns for the loading cells below.
path = kagglehub.dataset_download("gsimonx37/letterboxd")

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/minicarter/.cache/kagglehub/datasets/gsimonx37/letterboxd/versions/2


In [None]:
# List the CSV files in the kagglehub download directory (`path`).
# os.listdir returns entries in arbitrary order, so sort them to make the
# preview (and anything keyed off `csv_files`) reproducible between runs.
csv_files = sorted(file for file in os.listdir(path) if file.endswith(".csv"))

# Preview the first rows of each CSV file.
for file in csv_files:
    file_path = os.path.join(path, file)
    print(f"\nHead of {file}:")
    try:
        # Only five rows are displayed, so only five rows are parsed; the
        # full files are loaded once, in the dictionary-building cell.
        preview = pd.read_csv(file_path, nrows=5)
        print(preview.head())
    except Exception as e:
        # Best-effort preview: report the unreadable file and keep going.
        print(f"Error reading {file}: {e}")


Head of posters.csv:
        id                                               link
0  1000001  https://a.ltrbxd.com/resized/film-poster/2/7/7...
1  1000002  https://a.ltrbxd.com/resized/film-poster/4/2/6...
2  1000003  https://a.ltrbxd.com/resized/film-poster/4/7/4...
3  1000004  https://a.ltrbxd.com/resized/film-poster/5/1/5...
4  1000005  https://a.ltrbxd.com/resized/film-poster/2/4/0...

Head of countries.csv:
        id      country
0  1000001           UK
1  1000001          USA
2  1000002  South Korea
3  1000003          USA
4  1000004      Germany

Head of releases.csv:
        id    country        date        type rating
0  1000001    Andorra  2023-07-21  Theatrical    NaN
1  1000001  Argentina  2023-07-20  Theatrical    ATP
2  1000001  Australia  2023-07-19  Theatrical     PG
3  1000001  Australia  2023-10-01     Digital     PG
4  1000001    Austria  2023-07-20  Theatrical    NaN

Head of genres.csv:
        id      genre
0  1000001     Comedy
1  1000001  Adventure
2  1000002

In [None]:
# Load every CSV into a dictionary keyed by the file name without its
# extension (e.g. "movies.csv" -> letterboxd_data["movies"]).
letterboxd_data = {}

for file in csv_files:
    # splitext strips only the trailing extension, unlike str.replace, which
    # would also remove ".csv" from the middle of a name.
    name = os.path.splitext(file)[0]
    # Build the location from the kagglehub `path` instead of a hard-coded
    # absolute path so the notebook runs on any machine / dataset version.
    df = pd.read_csv(os.path.join(path, file))
    letterboxd_data[name] = df


In [13]:
# Bind each table in `letterboxd_data` to its own module-level name so later
# cells can refer to the frames directly.
movies, actors, crew = (
    letterboxd_data[table] for table in ('movies', 'actors', 'crew')
)
languages, studios, countries = (
    letterboxd_data[table] for table in ('languages', 'studios', 'countries')
)
genres, themes, releases = (
    letterboxd_data[table] for table in ('genres', 'themes', 'releases')
)

In [None]:
# Attach studio information to the films so they can be filtered by the
# big-five studios later on.
# One row per film id, with every studio credited on that film gathered into
# a list. Grouping `studios` directly is enough: ids that have no matching
# film are discarded by the inner merge below.
studio_new = (
    studios
    .groupby('id')
    .agg({'studio': list})
    .reset_index()
)

# Always merge onto the untouched frame stored in `letterboxd_data` rather
# than onto `movies` itself: re-running this cell would otherwise merge a
# second time and produce `studio_x` / `studio_y` columns instead of `studio`.
movies = pd.merge(letterboxd_data['movies'], studio_new, on='id', how='inner')


Unnamed: 0,id,studio
0,1000001,"[LuckyChap Entertainment, Heyday Films, NB/GG ..."
1,1000002,[Barunson E&A]
2,1000003,"[IAC Films, AGBO, Ley Line Entertainment, Year..."
3,1000004,"[Fox 2000 Pictures, Regency Enterprises, The L..."
4,1000005,"[Summit Entertainment, Black Label Media, Gilb..."
...,...,...
438192,1941538,[Helsinki-filmi]
438193,1941539,[Onza Entertainment]
438194,1941541,[Enlight Media]
438195,1941557,[滚石]


In [None]:
# The "big five" studios of the 1940s.
bigfive = [
    "Warner Bros. Pictures",
    "Paramount Pictures",
    "20th Century Fox",
    "Metro-Goldwyn-Mayer",
    "RKO Radio Pictures",
]

# Flag each film whose studio list contains at least one big-five studio.
movies['bigfive'] = movies['studio'].apply(
    lambda credited: any(studio in bigfive for studio in credited)
)

# Keep only the flagged films ...
movies_bigfive = movies.loc[movies['bigfive']]

# ... and of those, only the ones whose `date` lies in 1941-1947 (both ends
# included).
in_period = movies_bigfive['date'].between(1941, 1947)
movies_df = movies_bigfive.loc[in_period]


In [56]:
# Join the selected films to their crew and cast. Both sides have a `name`
# column, so the merge yields `name_x` (film title) and `name_y` (person).
movieCrew = pd.merge(movies_df, crew, on='id', how='inner')
movieActors = pd.merge(movies_df, actors, on='id', how='inner')
# Every cast row gets the same role label, replacing whatever the actors
# table stored in `role`.
movieActors['role'] = "Actor"

# ignore_index=True gives the stacked frame a fresh, unique index. Without it
# the crew and cast rows share labels 0..n, and any label-based lookup such as
# `workers.loc[some_index, ...]` silently returns rows from both halves.
workers = pd.concat([movieCrew, movieActors], ignore_index=True)

# One integer id per distinct (person name, role) pair; the same name under
# two different roles therefore receives two different ids.
workers['person_id'] = workers.groupby(['name_y', 'role']).ngroup()
workers.head()



Unnamed: 0,id,name_x,date,tagline,description,minute,rating,studio,bigfive,role,name_y,person_id
0,1000361,Casablanca,1942.0,They had a date with fate in Casablanca!,"In Casablanca, Morocco in December 1941, a cyn...",102.0,4.27,[Warner Bros. Pictures],True,Director,Michael Curtiz,6819
1,1000361,Casablanca,1942.0,They had a date with fate in Casablanca!,"In Casablanca, Morocco in December 1941, a cyn...",102.0,4.27,[Warner Bros. Pictures],True,Producer,Hal B. Wallis,3735
2,1000361,Casablanca,1942.0,They had a date with fate in Casablanca!,"In Casablanca, Morocco in December 1941, a cyn...",102.0,4.27,[Warner Bros. Pictures],True,Writer,Julius J. Epstein,5578
3,1000361,Casablanca,1942.0,They had a date with fate in Casablanca!,"In Casablanca, Morocco in December 1941, a cyn...",102.0,4.27,[Warner Bros. Pictures],True,Writer,Philip G. Epstein,7598
4,1000361,Casablanca,1942.0,They had a date with fate in Casablanca!,"In Casablanca, Morocco in December 1941, a cyn...",102.0,4.27,[Warner Bros. Pictures],True,Writer,Howard Koch,4252


In [37]:
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

collaborative

In [57]:
# Build the collaboration graph: nodes are `person_id` values, and an edge's
# `weight` counts the films on which its two endpoints both appear.
G = nx.Graph()

for _film_id, film_person_ids in workers.groupby('id')['person_id']:
    # De-duplicate within a film: a person_id listed twice on the same film
    # would otherwise pair with itself (a self-loop) and count each of its
    # collaborations on that film more than once.
    names = film_person_ids.drop_duplicates().to_list()
    for name1, name2 in combinations(names, 2):
        if G.has_edge(name1, name2):
            G[name1][name2]['weight'] += 1
        else:
            G.add_edge(name1, name2, weight=1)

In [58]:
# Quick size check on the freshly built graph.
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()
print(f"Nodes: {node_total}")
print(f"Edges: {edge_total}")


Nodes: 9723
Edges: 857949


In [69]:
# Summarise each person_id into the attributes stored on its graph node.
# The (title, year) pair is built row-wise *before* grouping, so every film
# stays attached to its own date; looking dates up through
# `workers.loc[x.index, ...]` inside the aggregation mispairs them whenever
# the frame's index labels are not unique.
workers_agg = (
    workers
    .assign(film=list(zip(workers['name_x'], workers['date'])))
    .groupby('person_id')
    .agg(
        name=('name_y', 'first'),
        role=('role', 'first'),
        films=('film', list),
        film_count=('name_x', 'count'),
        studios=('studio', list),
    )
    .reset_index()
)
# No `.head()` here: truncating the summary to five rows would leave every
# other node in the graph without attributes.

# {person_id: {"name": ..., "role": ..., ...}} -> node attributes on G.
attr_dict = workers_agg.set_index('person_id').to_dict('index')
nx.set_node_attributes(G, attr_dict)

In [72]:
def graph_summary(G):
    """Print size and connectivity statistics for a NetworkX graph.

    Prints the node count, edge count, density, average degree and number of
    connected components, one per line.

    Args:
        G: The graph to summarise. It is passed to
            ``nx.number_connected_components``, so it is expected to be
            undirected (as the graph built in this notebook is).

    Returns:
        None. The statistics are written to standard output.
    """
    node_count = G.number_of_nodes()
    # An empty graph has no meaningful average degree; report 0 instead of
    # raising ZeroDivisionError.
    if node_count:
        average_degree = sum(d for _, d in G.degree()) / node_count
    else:
        average_degree = 0.0

    print(f"Nodes: {node_count}")
    print(f"Edges: {G.number_of_edges()}")
    print(f"Density: {nx.density(G):.4f}")
    print(f"Average degree: {average_degree:.2f}")
    print(f"Connected components: {nx.number_connected_components(G)}")

graph_summary(G)





Nodes: 9723
Edges: 857949
Density: 0.0182
Average degree: 176.48
Connected components: 8


In [174]:
# Collect every node that has no edge of weight greater than 1. A missing
# 'weight' attribute is read as 1, and a node with no edges at all also
# qualifies (the check is vacuously true for it).
nodes_to_remove = [
    node
    for node in G.nodes()
    if all(w <= 1 for _, _, w in G.edges(node, data='weight', default=1))
]

print(f"Removing {len(nodes_to_remove)} nodes with no strong connections")
# This edits G in place, so running the cell again filters the graph further.
G.remove_nodes_from(nodes_to_remove)

remaining_nodes = G.number_of_nodes()
remaining_edges = G.number_of_edges()
print(f"Remaining nodes: {remaining_nodes}")
print(f"Remaining edges: {remaining_edges}")

Removing 958 nodes with no strong connections
Remaining nodes: 1694
Remaining edges: 38915


In [175]:
from pyvis.network import Network

# Dark-themed, undirected PyVis canvas intended for a standalone HTML page
# (notebook=False).
net = Network(
    directed=False,
    notebook=False,
    height="750px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
)

# Work on a copy so the display attributes added below stay off G itself.
G_vis = G.copy()

In [176]:
# Nodes: size grows linearly with degree; the hover title shows the node id
# and its degree.
for node, deg in dict(G_vis.degree()).items():
    node_attrs = G_vis.nodes[node]
    node_attrs['size'] = 5 + 2 * deg
    node_attrs['title'] = f"{node} — degree: {deg}"

# Edges: width equals the collaboration weight (1 when no weight is stored).
for _, _, d in G_vis.edges(data=True):
    weight = d.get('weight', 1)
    d['width'] = weight
    d['title'] = f"Weight: {weight}"


In [177]:
# Configure the Barnes-Hut physics settings on `net` before the graph is
# loaded. The values below are hand-picked; NOTE(review): no rationale for
# them is recorded in this notebook.
net.barnes_hut(
    gravity=-20000,
    central_gravity=0.3,
    spring_length=200,   # base spring length
    spring_strength=0.05,
    damping=0.8,
    overlap=0
)

In [178]:
# Give every edge a preferred length that shrinks as its weight grows:
# 200 divided by the weight (weight read as 1 when absent).
for _, _, d in G_vis.edges(data=True):
    d['length'] = 200 / d.get('weight', 1)


In [179]:
# Copy the nodes and edges of G_vis, with the attributes set in the cells
# above, into the PyVis network.
net.from_nx(G_vis)


In [180]:
# Add only the 'physics' control panel to the generated page.
net.show_buttons(filter_=['physics'])  # allows you to tweak physics interactively


In [181]:

# Write the network to collaboration_network.html in the working directory
# and open it in the browser.
net.write_html("collaboration_network.html", open_browser=True)


In [None]:
# -----------------------------
# PyVis Collaboration Graph
# -----------------------------
# Self-contained version of the visualisation: starting from the NetworkX
# graph G, drop one-off collaborations, style what is left, and write an
# interactive HTML page.

from pyvis.network import Network
import networkx as nx

# Work on a copy so G itself is not modified by the filtering below.
G_vis = G.copy()

# Step 1: Remove weak edges (weight <= 1; a missing weight counts as 1)
edges_to_remove = [(u, v) for u, v, d in G_vis.edges(data=True) if d.get('weight', 1) <= 1]
G_vis.remove_edges_from(edges_to_remove)

# Step 2: Remove nodes with no remaining edges
nodes_to_remove = [n for n, d in G_vis.degree() if d == 0]
G_vis.remove_nodes_from(nodes_to_remove)

print(f"Nodes remaining: {G_vis.number_of_nodes()}")
print(f"Edges remaining: {G_vis.number_of_edges()}")

# Step 3: Encode node size by degree and edge width by weight
for node in G_vis.nodes():
    deg = G_vis.degree(node)
    G_vis.nodes[node]['size'] = 5 + 2 * deg
    G_vis.nodes[node]['title'] = f"{node} — degree: {deg}"

for u, v, d in G_vis.edges(data=True):
    weight = d.get('weight', 1)
    d['width'] = 1 + weight       # edge thickness
    d['title'] = f"Weight: {weight}"
    d['length'] = 200 / weight    # shorter spring for stronger connections

# Step 4: Initialize PyVis network
net = Network(
    height="750px",
    width="100%",
    notebook=False,      # <-- important for external browser
    bgcolor="#222222",
    font_color="white"
)

# Optional: improve physics simulation for weighted layout
net.barnes_hut(
    gravity=-20000,
    central_gravity=0.3,
    spring_length=200,
    spring_strength=0.05,
    damping=0.8,
    overlap=0
)

# Step 5: Load graph into PyVis
net.from_nx(G_vis)

# Optional: show physics controls
net.show_buttons(filter_=['physics'])

# Step 6: Render in external browser.
# Use write_html(..., open_browser=True), as the other export cells in this
# notebook do. Calling net.show(...) on a network created with notebook=False
# is the likely source of the "'NoneType' object has no attribute 'render'"
# error recorded further down in this notebook.
net.write_html("collaboration_network.html", open_browser=True)


In [162]:

# NOTE(review): G_core is not defined in any cell visible in this notebook;
# presumably it is a filtered version of G from a removed cell - confirm.
# Compute a seeded spring layout once, then store the coordinates on the
# nodes so they travel with the graph.
pos = nx.spring_layout(G_core, k=0.3, seed=42)

for node, p in pos.items():
    # Coordinates are multiplied by 1000 before being stored as x / y.
    x_coord, y_coord = p[0] * 1000, p[1] * 1000
    core_attrs = G_core.nodes[node]
    core_attrs['x'] = x_coord
    core_attrs['y'] = y_coord

In [None]:
# Fresh PyVis network for G_core, replacing the earlier `net`.
net = Network(
    height="750px",
    width="100%",
    notebook=False,
    directed=False,
    bgcolor="#222222",
)
net.from_nx(G_core)

fast_collab_graph.html


AttributeError: 'NoneType' object has no attribute 'render'

In [165]:
# Write the current `net` to fast_collab_graph.html and open it in the browser.
net.write_html("fast_collab_graph.html", open_browser=True)


In [76]:
# Export the collaboration graph to GEXF.
# Container-valued node attributes (list / dict / tuple) are converted to
# their string form for the export. The conversion is done on a copy so G
# keeps its structured attributes; stringifying G in place would corrupt the
# film and studio lists for every other cell that reads them.
G_export = G.copy()

for _, attrs in G_export.nodes(data=True):
    # Iterate over a snapshot of the items because values are reassigned
    # inside the loop.
    for key, value in list(attrs.items()):
        if isinstance(value, (list, dict, tuple)):
            attrs[key] = str(value)


nx.write_gexf(G_export, "collaboration_network.gexf")