In [1]:
# import the libraries
import gc
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [2]:
# Download the data

!rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz

In [3]:
# Read relevant data from the titles.
# Only the four columns used below are parsed, and all of them as strings, so
# pandas never has to guess a dtype chunk by chunk.
# NOTE(review): that guessing is presumably what produced the warning this cell
# used to print -- confirm on the next run.
# startYear is converted to a number in the following cell.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    dtype=str,
).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Keep only rows whose titleType is 'movie' and turn startYear into a number.
# .assign() builds a new frame, so the converted column is never written into a
# filtered slice of the original (the pattern behind SettingWithCopyWarning).
title = title[title['titleType'] == 'movie'].assign(
    startYear=lambda frame: pd.to_numeric(frame['startYear'], errors='coerce')
)
# Years that could not be parsed were coerced to NaN above; NaN fails the
# comparison, so those titles are dropped together with the pre-2005 ones.
title = title[title['startYear'] >= 2005]

In [5]:
# load the cast of each film
# usecols keeps the columns that are never used out of memory; dtype=str skips
# per-chunk type inference on the three id/label columns that are kept.
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
    dtype=str,
)[['tconst', 'nconst', 'category']]

In [6]:
# Only consider on-screen performers, not directors, composers, etc.
# IMDb files performers under two separate category values, 'actor' and
# 'actress'; matching only 'actor' silently drops every actress from the network.
# NOTE(review): if a male-only network was intended, restore {'actor'}.
actors = cast[cast['category'].isin({'actor', 'actress'})]

In [7]:
# Keep only the credits whose film is still present in `title`
actors = actors.loc[lambda credits: credits['tconst'].isin(title.index)]
# this is what the network looks like
# actors.head(5)

In [8]:
# Delete the original data to save memory.
# Only the name `cast` is unbound here; the filtered `actors` frame built in the
# earlier cells is what the rest of the notebook works with.
del cast

In [9]:
# Region code(s) recorded for each title, indexed by titleId.
# - usecols: only these two columns are needed, so the rest is never loaded.
# - keep_default_na=False: pandas would otherwise read the literal text 'NA' as
#   a missing value, making that region code impossible to select later.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    usecols=['titleId', 'region'],
    dtype=str,
    keep_default_na=False,
).set_index('titleId')['region']
# region.value_counts().head(20)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Person names and birth years, indexed by nconst.
# Only the three columns used are parsed, as strings, to avoid loading the
# unused columns and to skip per-chunk dtype inference.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    usecols=['nconst', 'primaryName', 'birthYear'],
    dtype=str,
).set_index('nconst')[['primaryName', 'birthYear']]

In [11]:
def get_pairs(lang="en", min_acted=25, min_pairings=1):
    '''
    Build the co-acting network from the module-level `actors`, `region` and
    `name` tables.

    Parameters
    ----------
    lang : str or None
        Value matched against the `region` series (IN, US, etc). Only films that
        have an entry with exactly this region are kept. Pass None to skip the
        region filter altogether.
        NOTE(review): the default "en" looks like a language code rather than a
        region code, so it may well select no films -- confirm.
    min_acted : int
        Keep only actors credited in at least this many distinct films
        (counted after the region filter).
    min_pairings : int
        Keep only pairs that share at least this many films.

    Returns
    -------
    pairs : DataFrame
        Columns `row`, `col`, `n`. `row` and `col` are integer positions into
        `cat`; `n` is the number of films the two actors share. The table is
        symmetric: each pair appears once as (row, col) and once as (col, row).
        An actor is never paired with themselves.
    cat : DataFrame
        `name` reindexed to the selected actors, in the order `row` and `col`
        refer to.

    If nothing survives the filters, an empty `pairs` table (same columns) and
    an empty `cat` are returned instead of raising.
    '''
    graph = actors
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]

    # A person credited more than once on the same film must still count as a
    # single appearance; otherwise both the film tally below and the shared-film
    # counts from the matrix product are inflated.
    graph = graph.drop_duplicates(subset=['tconst', 'nconst'])

    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')
    cat = name.reindex(p['name'].cat.categories)

    if p.empty:
        # csr_matrix cannot infer a shape from zero-length index arrays, so
        # short-circuit with an empty result of the usual layout.
        no_codes = np.array([], dtype='int')
        return pd.DataFrame({'row': no_codes, 'col': no_codes, 'n': no_codes}), cat

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    # Film x actor incidence matrix M; (M.T @ M)[i, j] is the number of films
    # in which actors i and j both appear.
    matrix = csr_matrix((data, (row, col)))
    square = (matrix.T @ matrix).tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    # Drop the diagonal (an actor with themselves) explicitly, so it cannot leak
    # into the result even when min_pairings is 0 or negative.
    keep = (pairs['row'] != pairs['col']) & (pairs['n'] >= min_pairings)
    return pairs[keep].reset_index(drop=True), cat

def lookup(pairs, cat):
    '''
    Replace the positional `row` / `col` codes in `pairs` with the matching
    rows of `cat`.

    `pairs` needs the columns `row`, `col` and one count column; `cat` is
    looked up by position. The result has the columns
    count, name1, year1, name2, year2 and is sorted by count, largest first.
    NOTE(review): the five-name relabelling assumes `cat` has exactly two
    columns (name, then year) -- confirm against the caller.
    '''
    def pick(positions):
        # Rows of `cat` at the given positions, renumbered from 0 so that they
        # line up with the rows of `pairs` when concatenated side by side.
        return cat.iloc[positions].reset_index(drop=True)

    named = pd.concat([pairs, pick(pairs.row), pick(pairs.col)], axis=1)
    named = named.drop(columns=['row', 'col'])
    named.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return named.sort_values('count', ascending=False)

In [14]:
# Region 'IN', actors with at least 10 films, pairs sharing at least 3 films
pairs, cat = get_pairs(lang='IN', min_acted=10, min_pairings=3)

In [None]:
# Swap the positional row/col codes for actor names and birth years;
# the result is sorted by pair count, highest first.
forkumu = lookup(pairs, cat)

In [None]:
# Column names for the export (presumably the From / To / Strength layout that
# Kumu imports -- confirm).
# Renaming first and selecting afterwards keeps this cell re-runnable: rename()
# ignores labels that are already gone, whereas selecting 'name1' a second time
# would raise a KeyError.
forkumu = forkumu.rename(columns={'name1': 'From',
                                  'name2': 'To',
                                  'count': 'Strength'})
forkumu = forkumu[['From', 'To', 'Strength']]


In [None]:
# Write the edge list to a spreadsheet, without the DataFrame index column
forkumu.to_excel("pairs.xlsx", index=False)