In [1]:
# import the libraries
import gc
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [2]:
# Download the data

!rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz

In [3]:
# Read relevant data from the titles.
# Only the columns used below are parsed, and all of them as strings: this keeps
# memory down and avoids the mixed per-chunk type inference that
# low_memory=True can produce. startYear is converted to a number in a later cell.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    index_col='tconst',
    dtype=str,
)[['titleType', 'primaryTitle', 'startYear']]
title.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [4]:
# Keep only titles of type 'movie' whose startYear is 2005 or later.
# The steps are chained so that startYear is converted on a fresh frame rather
# than assigned onto a filtered slice (which triggers SettingWithCopyWarning),
# and so that re-running the cell gives the same result.
title = (
    title
    .loc[title['titleType'] == 'movie']
    # Non-numeric years become NaN and are dropped by the >= comparison below
    .assign(startYear=lambda films: pd.to_numeric(films['startYear'], errors='coerce'))
    .loc[lambda films: films['startYear'] >= 2005]
)
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0011801,movie,Tötet nicht mehr,2019.0
tt0013274,movie,Istoriya grazhdanskoy voyny,2021.0
tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,2020.0
tt0067683,movie,Workers '71: Nothing About Us Without Us,2006.0
tt0069049,movie,The Other Side of the Wind,2018.0


In [5]:
# load the cast of each film
cast = pd.read_csv('title.principals.tsv.gz', sep='\t')[['tconst', 'nconst', 'category']]
cast.head()

Unnamed: 0,tconst,nconst,category
0,tt0000001,nm1588970,self
1,tt0000001,nm0005690,director
2,tt0000001,nm0374658,cinematographer
3,tt0000002,nm0721526,director
4,tt0000002,nm1335271,composer


In [6]:
# Only consider performers, not directors, composers, etc.
# IMDb credits performers under two category values, 'actor' and 'actress';
# filtering on 'actor' alone left every actress out of the network.
# NOTE(review): drop 'actress' again if an actor-only network was intended.
actors = cast[cast['category'].isin(['actor', 'actress'])]

In [7]:
# Keep only the credits that belong to the films retained in `title`
in_selected_titles = actors['tconst'].isin(title.index)
actors = actors[in_selected_titles]
# this is what the network looks like
actors.head()

Unnamed: 0,tconst,nconst,category
80694,tt0011801,nm0459029,actor
80695,tt0011801,nm0681726,actor
80697,tt0011801,nm0726256,actor
80698,tt0011801,nm0776458,actor
80701,tt0011801,nm0666006,actor


In [8]:
# Delete the original data to save memory
del cast
# Ask the collector to reclaim the memory right away; the trailing semicolon
# hides the returned object count from the cell output.
gc.collect();

In [9]:
# Region code of every alternative title, indexed by title id.
# Only the two columns that are used get parsed, both as strings.
# keep_default_na=False stops pandas from turning the literal code "NA" into NaN;
# the file's own missing marker (\N) stays a plain string, as before.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    usecols=['titleId', 'region'],
    index_col='titleId',
    dtype=str,
    keep_default_na=False,
)['region']
region.value_counts().head(20)

  exec(code_obj, self.user_global_ns, self.user_ns)


JP     4032515
FR     4031409
DE     4028702
IN     3964239
ES     3953998
IT     3934567
PT     3869794
\N     1868267
US     1365969
GB      423974
CA      210283
XWW     166491
AU      122825
BR      113514
MX       91626
RU       91392
GR       90809
FI       85675
PL       84688
HU       72332
Name: region, dtype: int64

In [None]:
# Person details (name and birth year) indexed by person id.
# usecols parses only the columns that are kept instead of the whole file.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    usecols=['nconst', 'primaryName', 'birthYear'],
    index_col='nconst',
)[['primaryName', 'birthYear']]

In [None]:
def get_pairs(lang="en", min_acted=25, min_pairings=1):
    '''
    Returns the co-acting pairs among frequently credited actors.

    Reads three notebook-level objects: `actors` (one row per credit, with
    `tconst` and `nconst` columns), `region` (Series of region codes indexed
    by title id) and `name` (person details indexed by `nconst`).

    Parameters:
    - lang: region code matched against the values of `region` (IN, US, etc).
      Pass None to use every film. NOTE(review): the default "en" looks like a
      language code rather than a region code, so it probably selects no
      films - confirm; it is kept unchanged for backward compatibility.
    - min_acted: each actor must have acted in at least this many of the
      selected films.
    - min_pairings: the two actors must have acted together in at least this
      many of the selected films.

    Returns (pairs, cat):
    - pairs: DataFrame with columns `row`, `col`, `n`, where `row` and `col`
      are integer positions into `cat` and `n` is the number of shared films.
      Every pair is listed in both directions, (i, j) and (j, i); an actor is
      never paired with themselves.
    - cat: `name` reindexed to the actor ids that were kept.
    If no credit survives the filters, `pairs` is empty rather than an error.
    '''
    graph = actors
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    # Count a film once per actor even when the same credit is listed twice;
    # otherwise both the film counts and the pair counts are inflated.
    graph = graph.drop_duplicates(subset=['tconst', 'nconst'])

    films_per_actor = graph['nconst'].value_counts()
    prolific = films_per_actor[films_per_actor >= min_acted].index
    top_actors = graph[graph['nconst'].isin(prolific)]

    # Category codes give dense 0..k-1 integer ids for films and actors
    title_ids = top_actors['tconst'].astype('category')
    actor_ids = top_actors['nconst'].astype('category')
    cat = name.reindex(actor_ids.cat.categories)

    if top_actors.empty:
        # csr_matrix cannot be built from zero-length index arrays
        no_pairs = pd.DataFrame({
            'row': np.empty(0, dtype='int32'),
            'col': np.empty(0, dtype='int32'),
            'n': np.empty(0, dtype='int'),
        })
        return no_pairs, cat

    # Film x actor incidence matrix: 1 where the actor is credited in the film
    matrix = csr_matrix(
        (
            np.ones(len(top_actors), dtype='int'),
            (title_ids.cat.codes.values, actor_ids.cat.codes.values),
        ),
        shape=(len(title_ids.cat.categories), len(actor_ids.cat.categories)),
    )
    # (actor x film) @ (film x actor): entry (i, j) = films shared by i and j
    square = (matrix.T @ matrix).tocsr()
    # The diagonal holds each actor's own film count; zero it and drop the
    # stored zeros so self-pairs never reach the output, whatever min_pairings is.
    square.setdiag(0)
    square.eliminate_zeros()
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, cat

def lookup(pairs, cat):
    '''
    Replaces the positional ids in `pairs` with the actor details from `cat`.

    Parameters:
    - pairs: DataFrame with columns `row`, `col` and `n`, where `row` and
      `col` are integer positions into `cat`.
    - cat: DataFrame with two columns per actor (name, then birth year).

    Returns a new DataFrame with columns `count`, `name1`, `year1`, `name2`,
    `year2`, sorted by `count` in descending order. `pairs` is not modified.
    '''
    # concat(axis=1) aligns on the index, so all three pieces must share the
    # same 0..n-1 index. Resetting `pairs` here keeps the rows aligned even
    # when the caller passes a filtered frame with gaps in its index.
    pairs = pairs.reset_index(drop=True)
    first = cat.iloc[pairs['row'].to_numpy()].reset_index(drop=True)
    second = cat.iloc[pairs['col'].to_numpy()].reset_index(drop=True)

    named = pd.concat([pairs, first, second], axis=1)
    named = named.drop(columns=['row', 'col'])
    named.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return named.sort_values('count', ascending=False)

In [None]:
# Build the co-acting network for region 'IN' with the chosen thresholds
pairs, cat = get_pairs(
    lang='IN',
    min_acted=10,
    min_pairings=3,
)

In [1]:
# Display the pair table (columns row, col, n) returned by get_pairs
pairs

NameError: ignored

In [None]:
# Display the actor table returned by get_pairs; `row`/`col` in `pairs` are positions into it
cat

In [None]:
# Swap the positional ids for actor names and birth years
forkumu = lookup(pairs=pairs, cat=cat)
forkumu

In [None]:
# Keep the two names and the count, relabelled as From / To / Strength
forkumu = (
    forkumu
    .loc[:, ['name1', 'name2', 'count']]
    .rename(columns={'name1': 'From', 'name2': 'To', 'count': 'Strength'})
)
forkumu

In [None]:
# Write the edge list to an Excel workbook, leaving out the DataFrame index
forkumu.to_excel(
    "pairs.xlsx",
    index=False,
)