In [1]:
import csv
import pandas as pd

# Actor Names

In [2]:
names = pd.read_csv('name.csv', usecols=[0,1,2,4], index_col=0,
                    names=['id', 'name', 'numeral', 'sex'],
                    dtype={'sex': str})

def swap_names(name):
    if ',' in name:
        last, first = name.split(',', 1)
        name = first.strip() + ' ' + last.strip()
    return name

names.name = names.name.apply(swap_names)

n = names.numeral.notnull() & (names.numeral != 'I')
names.loc[n, 'name'] = names.name[n] + ' (' + names.numeral[n] + ')'
del names['numeral']

In [3]:
print(len(names))
print(names.dtypes)
names.head()

5080773
name    object
sex     object
dtype: object


Unnamed: 0_level_0,name,sex
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3847,Jeremy Abe,m
2903,Mohamed AbdAllah,m
614,David A.,m
187,Esteban Rodriguez -Alverio,m
2132,Athar Abbas,m


In [4]:
names[names.name.str.startswith('George Clooney')]

Unnamed: 0_level_0,name,sex
id,Unnamed: 1_level_1,Unnamed: 2_level_1
359800,George Clooneye,m
359797,George Clooney,m
359798,George Clooney (II),m


In [5]:
names.sex.unique()

array(['m', 'f', nan], dtype=object)

# Movie Titles

In [6]:
titles = pd.read_csv('title.csv', usecols=[0,1,2,3,4], index_col=0,
                     names=['id', 'title', 'numeral', 'type', 'year'])

# 1 Feature film
# 2 TV series
# 3 TV movie
# 4 Adult film
# 5 (no rows match)
# 6 Video game
# 7 TV series episode

titles = titles[(titles.type == 1) & (titles.year.notnull())]
del titles['type']

n = titles.numeral.notnull() & (titles.numeral != 'I')
titles.title[n] = titles.title[n] + ' (' + titles.numeral[n] + ')'
del titles['numeral']

In [7]:
titles = titles.drop_duplicates()

In [8]:
print('{:,}'.format(len(titles)))
print(titles.dtypes)
titles.head()

808,623
title     object
year     float64
dtype: object


Unnamed: 0_level_0,title,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2161856,3orthographies,2013
2154763,11 Minutes,2014
2175735,A Lélek órása,1923
2172998,A Gypsy Girl's Love,1908
2185884,A Woman Scorned,1999


In [9]:
%%time
avoid_ids = set()
with open('movie_info.csv') as f:
    for row in csv.reader(f):
        if row[2] == '3' and row[3] in ('Adult', 'Short', 'Documentary'):
            avoid_ids.add(int(row[1]))
print(len(avoid_ids))

665300
CPU times: user 27.7 s, sys: 384 ms, total: 28.1 s
Wall time: 28.1 s


In [10]:
titles = titles.drop(titles.select(avoid_ids.__contains__).index)

In [11]:
print('{:,}'.format(len(titles)))
print(titles.dtypes)
titles.head()

306,705
title     object
year     float64
dtype: object


Unnamed: 0_level_0,title,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2175735,A Lélek órása,1923
2198464,Aizaugusa gravi viegli krist,1986
2196448,Agliyorum,1988
2153439,0_1_0,2008
2165482,97 fung lau mung,1994


# Character Names

In [12]:
characters = pd.read_csv('char_name.csv', usecols=[0,1], index_col=0,
                     names=['id', 'character'])
characters.sort_index()
print('{:,}'.format(len(characters)))

3,628,462


In [13]:
characters = characters.drop_duplicates()
print('{:,}'.format(len(characters)))

3,627,738


In [14]:
characters.head()

Unnamed: 0_level_0,character
id,Unnamed: 1_level_1
7340,U.S.S. Soldier
8564,Count Rood
94149,Conny De Vooght
83253,Dago
68546,Janitor B


# Which Actors Star In Which Movies

In [15]:
# From imdb/parser/sql/dbschema.py

for i, role_type_name in enumerate((
        None, 'actor', 'actress', 'producer', 'writer',
        'cinematographer', 'composer', 'costume designer',
        'director', 'editor', 'miscellaneous crew',
        'production designer', 'guest')):
    print(i, role_type_name)

0 None
1 actor
2 actress
3 producer
4 writer
5 cinematographer
6 composer
7 costume designer
8 director
9 editor
10 miscellaneous crew
11 production designer
12 guest


In [16]:
#%%timeit -n1 -r1

if 'raw_cast' in dir():
    del raw_cast

column_names = ['name_id', 'title_id', 'character_id', 'n', 'role_type']

raw_cast = pd.read_csv(
    'cast_info.csv', usecols=[1,2,3,5,6], names=column_names,
    dtype=dict.fromkeys(['name_id', 'title_id', 'title', 'role_type'], 'int32'))

print('{:,}'.format(len(raw_cast)))
print(raw_cast.dtypes)
raw_cast.head()

45,555,958
name_id           int32
title_id          int32
character_id    float64
n               float64
role_type         int32
dtype: object


Unnamed: 0,name_id,title_id,character_id,n,role_type
0,1,1272777,1.0,,1
1,2,2797886,1.0,25.0,1
2,2,2995545,2.0,22.0,1
3,3,2418021,,12.0,1
4,4,2159054,3.0,,1


In [17]:
if 'cast' in dir():
    del cast

# Other columns:
# 3  Role id, or 1 if they appeared as themselves
# 4  Notes like "(archive footage)" and "(uncredited)"
# 5  Order of actor/actress in billing
# 6  Role type (see role types in previous cell)

# Only keep rows for actors and actresses, in named roles.

cast = raw_cast.loc[
    ((raw_cast.role_type == 1) | (raw_cast.role_type == 2))
    & raw_cast.character_id.notnull()
    ].copy()

cast['type'] = cast.pop('role_type').map({1: 'actor', 2: 'actress'})

# Only keep rows that match our table of feature films.

print(cast.head())
cast = pd.merge(titles[['title', 'year']], cast,
                left_index=True, right_on='title_id', sort=False)
del cast['title_id']

cast = pd.merge(names[['name']], cast, left_index=True, right_on='name_id', sort=False)
del cast['name_id']

cast = pd.merge(characters[['character']], cast, left_index=True, right_on='character_id', sort=False)
del cast['character_id']

# Re-order columns

cast['year'] = cast['year'].astype('int32')

cast = cast[['title', 'year', 'name', 'type', 'character', 'n']]

print('{:,}'.format(len(cast)))
print(cast.dtypes)

   name_id  title_id  character_id   n   type
0        1   1272777             1 NaN  actor
1        2   2797886             1  25  actor
2        2   2995545             2  22  actor
4        4   2159054             3 NaN  actor
5        4   2159055             1 NaN  actor
3,768,347
title         object
year           int32
name          object
type          object
character     object
n            float64
dtype: object


In [18]:
cast.drop_duplicates().reindex().head()

Unnamed: 0,title,year,name,type,character,n
22434,The Core,2003,Alejandro Abellan,actor,U.S.S. Soldier,
3304875,Il momento di uccidere,1968,Remo De Angelis,actor,Dago,9.0
3473346,Across the Divide,1921,Thomas Delmar,actor,Dago,4.0
6648135,Revan,2012,Diego James,actor,Dago,
8250700,Un homme marche dans la ville,1950,Fabien Loris,actor,Dago,12.0


In [19]:
cast[cast.title == 'Star Wars'].sort('n')

Unnamed: 0,title,year,name,type,character,n
5673560,Star Wars,1977,Mark Hamill,actor,Luke Skywalker,1
4546784,Star Wars,1977,Harrison Ford,actor,Han Solo,2
18030328,Star Wars,1977,Carrie Fisher,actress,Princess Leia Organa,3
3090888,Star Wars,1977,Peter Cushing,actor,Grand Moff Tarkin,4
5506098,Star Wars,1977,Alec Guinness,actor,Ben Obi-Wan Kenobi,5
3190897,Star Wars,1977,Anthony Daniels,actor,C-3PO,6
718687,Star Wars,1977,Kenny Baker,actor,R2-D2,7
8927834,Star Wars,1977,Peter Mayhew (II),actor,Chewbacca,8
11188372,Star Wars,1977,David Prowse,actor,Darth Vader,9
1750564,Star Wars,1977,Phil Brown,actor,Uncle Owen,11


# Release Dates

In [20]:
def filter_csv(chunk):
    r = chunk
    r = r[r.code == 16]
    #print(r.head())
    r = r[r.note.isnull()]
    r = r[['title_id', 'data']]
    print(len(r), end=' '),
    return r

iter_csv = pd.read_csv(
    'movie_info.csv',
    usecols=[1,2,3,4],
    names=['title_id', 'code', 'data', 'note'],
    dtype={'note': 'str'},
    iterator=True,
    chunksize=100000,
    )
release_dates_raw = pd.concat([filter_csv(chunk) for chunk in iter_csv])
print()

release_dates_raw.head()

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 43330 98511 98037 98496 98743 98440 98750 98202 98581 98584 99025 98333 98705 98493 98485 98506 98236 98486 98779 98572 98699 97964 95503 59544 59524 58540 58693 58693 61520 60700 59150 60118 57798 59205 59032 62141 63470 60937 51732 0 0 


Unnamed: 0,title_id,data
55637,1,Italy:1 January 1994
55638,2,USA:22 January 2006
55639,3,USA:12 February 2006
55640,4,USA:19 February 2006
55641,5,USA:22 January 2006


In [21]:
r = release_dates_raw

r['country'] = r.data.str.extract('^(.*):')
r['date'] = r.data.str.extract(':(.*)$')
del r['data']

r['date'] = pd.to_datetime(r.date, infer_datetime_format=True)
release_dates_all = r
release_dates_all.head()

Unnamed: 0,title_id,country,date
55637,1,Italy,1994-01-01
55638,2,USA,2006-01-22
55639,3,USA,2006-02-12
55640,4,USA,2006-02-19
55641,5,USA,2006-01-22


In [22]:
release_dates = pd.merge(titles[['title', 'year']], release_dates_all,
                         left_index=True, right_on='title_id', sort=False)
del release_dates['title_id']
release_dates = release_dates.drop_duplicates()
release_dates.head()

Unnamed: 0,title,year,country,date
93637,0_1_0,2008,Poland,2008-11-14
53431,Ai no Sanka,1967,Japan,1967-01-01
33904,A Thousand to One,1920,USA,1920-12-07
29244,A Prince of a King,1923,USA,1923-10-13
29245,A Prince of a King,1923,Netherlands,1924-08-08


# Save

In [23]:
titles.to_csv('../data/titles.csv', index=False)

In [24]:
cast.head()

Unnamed: 0,title,year,name,type,character,n
22434,The Core,2003,Alejandro Abellan,actor,U.S.S. Soldier,
3304875,Il momento di uccidere,1968,Remo De Angelis,actor,Dago,9.0
3473346,Across the Divide,1921,Thomas Delmar,actor,Dago,4.0
6648135,Revan,2012,Diego James,actor,Dago,
8250700,Un homme marche dans la ville,1950,Fabien Loris,actor,Dago,12.0


In [25]:
cast.to_csv('../data/cast.csv', index=False)

In [26]:
release_dates.head()

Unnamed: 0,title,year,country,date
93637,0_1_0,2008,Poland,2008-11-14
53431,Ai no Sanka,1967,Japan,1967-01-01
33904,A Thousand to One,1920,USA,1920-12-07
29244,A Prince of a King,1923,USA,1923-10-13
29245,A Prince of a King,1923,Netherlands,1924-08-08


In [27]:
release_dates.to_csv('../data/release_dates.csv', index=False)