# Actor Collaboration Network

In [1]:
import csv
import random
import time
from collections import Counter
from itertools import combinations
from itertools import permutations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from networkx.algorithms import bipartite

In [2]:
# Read the title table, keeping only the columns used later on.
# Missing values are encoded in the file as the literal '\N'.
file_movies = 'title.basics.tsv/title.basics.tsv'
movie_dtypes = {
    'tconst': 'str',
    'titleType': 'str',
    'primaryTitle': 'str',
    'startYear': 'Int64',
}
movies = pd.read_csv(
    file_movies,
    sep='\t',
    na_values='\\N',
    dtype=movie_dtypes,
    usecols=list(movie_dtypes),
)
movies = movies.set_index('tconst')

In [3]:
# Keep titles of type 'movie' whose startYear is 2003 or later, then drop the
# column that was only needed for this filter.
# NOTE: this rebinds `movies`, so running the cell a second time raises a
# KeyError because 'titleType' is already gone.
is_recent = movies['startYear'] >= 2003  # try 1983
is_movie = movies['titleType'] == 'movie'
movies = movies[is_recent & is_movie].drop(columns='titleType')

In [4]:
# Read the alternate-titles table (all columns); '\N' marks missing values.
file_akas = 'title.akas.tsv/title.akas.tsv'
akas_dtypes = {
    'titleId': 'str',
    'ordering': 'int',
    'title': 'str',
    'region': 'str',
    'language': 'str',
    'types': 'str',
    'attributes': 'str',
}
akas_data = pd.read_csv(
    file_akas,
    sep='\t',
    na_values='\\N',
    dtype=akas_dtypes,
)

In [5]:
# Select the movies whose first alternate-title entry (ordering == 1) belongs
# to one of the allowed regions.
allowed_regions = ['IT']
in_allowed_region = akas_data['region'].isin(allowed_regions)
is_first_entry = akas_data['ordering'] == 1
filtered_akas_data = akas_data[in_allowed_region & is_first_entry]
unique_title_ids = filtered_akas_data['titleId'].unique()
final_movies = movies[movies.index.isin(unique_title_ids)]
# NOTE(review): the cells below keep filtering on `movies.index`, not on
# `final_movies`, so this region filter does not reach the graph -- confirm
# whether that is intended.

In [6]:
# Read the principal-credits table, keeping only title id, person id and role category.
# NOTE(review): the file name is 'title.principal.tsv' while its folder is
# 'title.principals.tsv'; the other two paths repeat the folder name exactly --
# verify this path against the files on disk.
file_cast = 'title.principals.tsv/title.principal.tsv'
cast_dtypes = {
    'tconst': 'str',
    'nconst': 'str',
    'category': 'str',
}
cast = pd.read_csv(
    file_cast,
    sep='\t',
    na_values='\\N',
    dtype=cast_dtypes,
    usecols=list(cast_dtypes),
)

In [7]:
# Keep only actor/actress credits that belong to the selected movies, and
# renumber the rows from zero.
is_acting_credit = cast['category'].isin({'actor', 'actress'})
in_selected_movies = cast['tconst'].isin(movies.index)
cast = cast[is_acting_credit & in_selected_movies].reset_index(drop=True)

In [8]:
# Build the bipartite actor-movie graph: one node per actor, one node per
# movie, and one edge for every acting credit in `cast`.
B = nx.Graph()

actors = cast['nconst'].unique()
# NOTE: this rebinds `movies` from the DataFrame to its index of title ids, so
# the filtering cells above cannot be re-run after this one without reloading.
movies = movies.index.unique()

B.add_nodes_from(actors, bipartite=0)  # Actors belong to set 0
B.add_nodes_from(movies, bipartite=1)  # Movies belong to set 1

# Insert all (actor, movie) edges in a single call. This produces the same
# edges as looping over cast.iterrows(), without building a Series per row.
B.add_edges_from(zip(cast['nconst'], cast['tconst']))

In [9]:
# Report the size of both node sets of the bipartite graph.
for label, node_set in (("Number of Actors:", actors), ("Number of Movies:", movies)):
    print(label, len(node_set))

Number of Actors: 459478
Number of Movies: 293353


In [10]:
def collaboration_weight(G, u, v):
    """Return how many neighbours nodes ``u`` and ``v`` share in ``G``.

    When ``G`` is the actor-movie graph and ``u``, ``v`` are actors, this is
    the number of movies in which both of them are credited.
    """
    return sum(1 for _ in nx.common_neighbors(G, u, v))

# Project the bipartite graph onto the actor nodes; each edge weight is
# computed by collaboration_weight.
proj_actors = bipartite.generic_weighted_projected_graph(
    B,
    actors,
    weight_function=collaboration_weight,
)

In [11]:
# Check whether the actor projection is itself a bipartite graph.
nx.is_bipartite(proj_actors)

False

In [12]:
# Alternative (disabled): build the weighted collaboration network with the built-in projection
#other=bipartite.weighted_projected_graph(B,actors)
#nx.is_connected(other)

In [13]:
# Check whether the whole actor projection forms a single connected component.
nx.is_connected(proj_actors)

False

In [14]:
# Sizes of all connected components of the projection, largest first.
conn_comp = sorted(map(len, nx.connected_components(proj_actors)), reverse=True)

In [15]:
# Node set of the largest connected component of the projection.
largest_cc = max(nx.connected_components(proj_actors), key=len)

In [16]:
# Number of actors in the largest connected component.
print("Number of Actors:", len(largest_cc))

Number of Actors: 314331


In [17]:
# Restrict the projection to the nodes of its largest connected component.
A = proj_actors.subgraph(largest_cc)

In [18]:
# Sanity check: the extracted component should be connected.
nx.is_connected(A)

True

In [19]:
# Independent copy of A; the node-removal experiment further down works on
# this copy.
initial_graph=A.copy()
nx.is_connected(initial_graph)

True

In [None]:
# Exact diameter of the largest connected component.
# NOTE: this may take a very long time on a graph of this size.
start = time.time()
diameter = nx.diameter(A)
end = time.time()
# Report the measured run time, as the other timed cells do, and show the
# result instead of silently discarding both.
print("Time taken =", end-start, "seconds")
print("Diameter:", diameter)

## Centrality measures

In [None]:
# Degree centrality of every actor in the giant component, ranked from the
# highest to the lowest score.
D = nx.degree_centrality(A)
degree_sort = sorted(
    D.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [None]:
# Approximate betweenness centrality from 1000 sampled source nodes.
# The seed fixes which nodes are sampled, so re-running the notebook gives the
# same ranking.
start = time.time()
BC = nx.betweenness_centrality(A, k=1000, seed=42)
end = time.time()
print("Time taken =", end-start, "seconds")
# Actors ranked from the highest to the lowest betweenness.
btw_sort = sorted(BC.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Closeness centrality for the (up to) 1000 actors with the highest betweenness.
V = [node for node, _ in btw_sort]
CC = {}
start = time.time()
# Slicing instead of indexing 0..999 avoids an IndexError when the graph has
# fewer than 1000 nodes.
for node in V[:1000]:
    CC[node] = nx.closeness_centrality(A, node)
end = time.time()
print("Time taken =", end-start, "seconds")
#L = sorted(D.items(), key=lambda item: item[1], reverse=True)

## Network perturbation

In [None]:
import random

# Rank the actors by degree centrality and pick the 10000 most central ones.
sorted_nodes = sorted(D.items(), key=lambda pair: pair[1], reverse=True)
top_nodes = [pair[0] for pair in sorted_nodes[:10000]]

# Targeted removal: delete those actors from a copy of the full projection.
actors_remove_top = proj_actors.copy()
actors_remove_top.remove_nodes_from(top_nodes)

In [None]:
# Largest connected component left after removing the highest-degree actors.
top_cc = max(nx.connected_components(actors_remove_top), key=len)
print("Number of Actors:", len(top_cc))

In [None]:
# Size of the largest connected component after the targeted removal.
print("Number of Actors:", len(top_cc))
# The original line `print(len()len(top_cc))` was a syntax error; the stray
# `len()` is removed.
print(len(top_cc))

In [None]:
# Random removal: delete up to 10000 randomly chosen actors from a copy of
# the projection.
# random.sample requires a sequence (a NodeView raises TypeError on
# Python 3.11+), so the node view is turned into a list first.
rng = random.Random(42)  # dedicated seeded generator for reproducibility
all_nodes = list(proj_actors.nodes())
# Cap the sample size so a graph with fewer than 10000 nodes does not raise.
random_nodes = rng.sample(all_nodes, min(10000, len(all_nodes)))

actors_remove_random = proj_actors.copy()
actors_remove_random.remove_nodes_from(random_nodes)

In [None]:
# Largest connected component left after removing the randomly chosen actors.
random_cc = max(nx.connected_components(actors_remove_random), key=len)

In [None]:
# Giant-component sizes: after targeted removal first, after random removal second.
print("Number of Actors:", len(top_cc))
print(len(random_cc))

In [None]:
# Degree of every node: in the giant component left after the targeted
# removal, and in the original giant component A.
top_cc_graph = actors_remove_top.subgraph(top_cc)
node_degrees = dict(top_cc_graph.degree())
node_degrees_A = dict(A.degree())

# Average neighbour degree, keyed by node degree, for both graphs.
average_degree_connectivity = nx.average_degree_connectivity(top_cc_graph)
average_degree_connectivity_A = nx.average_degree_connectivity(A)

# One (degree, average degree connectivity) pair per node of each graph.
degrees = list(node_degrees.values())
avg_deg_connectivity = [average_degree_connectivity[deg] for deg in degrees]

degrees_A = list(node_degrees_A.values())
avg_deg_connectivity_A = [average_degree_connectivity_A[deg] for deg in degrees_A]

# Plot both graphs together. The labels and legend are needed because the two
# series cannot otherwise be told apart.
plt.scatter(degrees, avg_deg_connectivity, alpha=0.5,
            label="After removing top-degree actors")
plt.scatter(degrees_A, avg_deg_connectivity_A, alpha=0.5,
            label="Original largest component")
plt.xlabel("Node Degree (Connectivity)")
plt.ylabel("Average Degree Connectivity")
plt.title("Node Degree vs. Average Degree Connectivity")
plt.legend()
plt.grid(True)
plt.show()

In [None]:

# Plot node degree vs. average degree connectivity for both graphs.
# NOTE(review): this cell redraws the same figure as the plotting code at the
# end of the previous cell; consider removing one of the two.
plt.scatter(degrees, avg_deg_connectivity, alpha=0.5,
            label="After removing top-degree actors")
plt.scatter(degrees_A, avg_deg_connectivity_A, alpha=0.5,
            label="Original largest component")
plt.xlabel("Node Degree (Connectivity)")
plt.ylabel("Average Degree Connectivity")
plt.title("Node Degree vs. Average Degree Connectivity")
# The legend makes the two otherwise unlabeled series distinguishable.
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Random-failure experiment: repeatedly delete a random batch of nodes from
# `initial_graph` and record the diameter of the largest component that remains.

# Initialize lists to store results
fraction_removed = []
diameters = []

# Set the number of nodes to remove at each iteration
nodes_to_remove = 50000

# Dedicated seeded generator so the experiment is reproducible
rng = random.Random(42)

# Get the initial number of nodes in the graph
initial_node_count = initial_graph.number_of_nodes()

# NOTE: the loop removes nodes from `initial_graph` in place; recreate it with
# A.copy() before running this cell again.
while initial_graph.number_of_nodes() > nodes_to_remove:
    # random.sample requires a sequence, so materialise the node view first
    nodes_to_remove_list = rng.sample(list(initial_graph.nodes()), nodes_to_remove)

    # remove_nodes_from mutates the graph and returns None, so its result
    # must not be passed on or chained.
    initial_graph.remove_nodes_from(nodes_to_remove_list)

    # The remaining graph may be disconnected, and nx.diameter raises on a
    # disconnected graph, so measure the largest connected component.
    new_largest_cc = max(nx.connected_components(initial_graph), key=len)
    new_graph = initial_graph.subgraph(new_largest_cc)

    # Compute the diameter of the largest remaining component
    # (exact computation; may be slow on large graphs)
    diameter = nx.diameter(new_graph)

    # Calculate the fraction of removed nodes
    removed_fraction = (initial_node_count - initial_graph.number_of_nodes()) / initial_node_count

    # Append results to lists
    fraction_removed.append(removed_fraction)
    diameters.append(diameter)

# Plot the changes in diameter as a function of the fraction of removed nodes
plt.figure(figsize=(10, 6))
plt.plot(fraction_removed, diameters, marker='o', linestyle='-', color='b')
plt.xlabel('Fraction of Removed Nodes')
plt.ylabel('Diameter')
plt.title('Changes in Diameter vs. Fraction of Removed Nodes')
plt.grid(True)
plt.show()