In [1]:
import pandas as pd
import glob
import requests
import itertools
from bs4 import BeautifulSoup

In [2]:
''' List the tsv files stored in the IMDb-Datasets directory that we will need to use '''
file_list = glob.glob('IMDb-Datasets/*.tsv')
file_list

['IMDb-Datasets/title.ratings.tsv',
 'IMDb-Datasets/title.principals.tsv',
 'IMDb-Datasets/name.basics.tsv',
 'IMDb-Datasets/title.basics.tsv',
 'IMDb-Datasets/title.episode.tsv']

In [3]:
''' Create a dictionary of dataframes from the IMDb datasets directory '''
imdb_dfs = {}
for file in file_list:
    df_name = file.split('.')
    df_name = df_name[0] + '.' + df_name[1] +'_df' # Example: df_name = title.ratings_df
    df_name = df_name.split('/')[1]
    imdb_dfs[df_name] = pd.read_csv(file, sep='\t', low_memory = False)

In [4]:
''' title.basics_df contains tconst for media in IMDb datasets  '''
imdb_dfs['title.basics_df'].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
the_office_df = imdb_dfs['title.basics_df'].loc[(imdb_dfs['title.basics_df'].primaryTitle == 'The Office') 
                                              & (imdb_dfs['title.basics_df'].titleType == 'tvSeries') , :]
the_office_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
109696,tt0112108,tvSeries,The Office,The Office,0,1995,\N,30,Comedy
279011,tt0290978,tvSeries,The Office,The Office,0,2001,2003,30,"Comedy,Drama"
371064,tt0386676,tvSeries,The Office,The Office,0,2005,2013,22,Comedy
1977861,tt1791001,tvSeries,The Office,Ha-Misrad,0,2010,\N,25,Comedy
2354182,tt2186395,tvSeries,The Office,The Office,0,2012,\N,5,Comedy


In [6]:
the_office_df['startYear']

109696     1995
279011     2001
371064     2005
1977861    2010
2354182    2012
Name: startYear, dtype: object

In [7]:
the_office_df = the_office_df.loc[the_office_df.startYear == '2005', :]
the_office_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
371064,tt0386676,tvSeries,The Office,The Office,0,2005,2013,22,Comedy


In [8]:
the_office_columns = ['primaryTitle', 'tconst']
the_office_df = the_office_df[the_office_columns]
the_office_df

Unnamed: 0,primaryTitle,tconst
371064,The Office,tt0386676


In [9]:
imdb_dfs['title.episode_df'].head()

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16


In [10]:
the_office_df = the_office_df.merge(imdb_dfs['title.episode_df'], left_on=the_office_df.tconst,
                                    right_on=imdb_dfs['title.episode_df'].parentTconst,
                                    suffixes=('_series', '_episode'), how='inner')
the_office_df.head()

Unnamed: 0,key_0,primaryTitle,tconst_series,tconst_episode,parentTconst,seasonNumber,episodeNumber
0,tt0386676,The Office,tt0386676,tt0664510,tt0386676,1,5
1,tt0386676,The Office,tt0386676,tt0664511,tt0386676,2,11
2,tt0386676,The Office,tt0386676,tt0664512,tt0386676,2,15
3,tt0386676,The Office,tt0386676,tt0664513,tt0386676,2,10
4,tt0386676,The Office,tt0386676,tt0664514,tt0386676,1,2


In [11]:
the_office_df = the_office_df.loc[the_office_df.seasonNumber == '1', :]
the_office_df

Unnamed: 0,key_0,primaryTitle,tconst_series,tconst_episode,parentTconst,seasonNumber,episodeNumber
0,tt0386676,The Office,tt0386676,tt0664510,tt0386676,1,5
4,tt0386676,The Office,tt0386676,tt0664514,tt0386676,1,2
7,tt0386676,The Office,tt0386676,tt0664517,tt0386676,1,3
8,tt0386676,The Office,tt0386676,tt0664518,tt0386676,1,6
11,tt0386676,The Office,tt0386676,tt0664521,tt0386676,1,1
13,tt0386676,The Office,tt0386676,tt0664523,tt0386676,1,4


In [12]:
the_office_columns[1] = 'tconst_series'
new_columns = ['tconst_episode', 'seasonNumber', 'episodeNumber']
for column in new_columns:
    the_office_columns.append(column)
the_office_df = the_office_df[the_office_columns]
the_office_df

Unnamed: 0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber
0,The Office,tt0386676,tt0664510,1,5
4,The Office,tt0386676,tt0664514,1,2
7,The Office,tt0386676,tt0664517,1,3
8,The Office,tt0386676,tt0664518,1,6
11,The Office,tt0386676,tt0664521,1,1
13,The Office,tt0386676,tt0664523,1,4


In [13]:
imdb_dfs['title.ratings_df'].head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1513
1,tt0000002,6.3,184
2,tt0000003,6.6,1160
3,tt0000004,6.3,113
4,tt0000005,6.2,1865


In [14]:
the_office_df = the_office_df = the_office_df.merge(imdb_dfs['title.ratings_df'], left_on=the_office_df.tconst_episode,
                                    right_on=imdb_dfs['title.ratings_df'].tconst,
                                    suffixes=('_episode', '_ratings'), how='inner')
the_office_df

Unnamed: 0,key_0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,tconst,averageRating,numVotes
0,tt0664510,The Office,tt0386676,tt0664510,1,5,tt0664510,8.4,3132
1,tt0664514,The Office,tt0386676,tt0664514,1,2,tt0664514,8.3,3507
2,tt0664517,The Office,tt0386676,tt0664517,1,3,tt0664517,7.9,2939
3,tt0664518,The Office,tt0386676,tt0664518,1,6,tt0664518,7.8,2820
4,tt0664521,The Office,tt0386676,tt0664521,1,1,tt0664521,7.6,3644
5,tt0664523,The Office,tt0386676,tt0664523,1,4,tt0664523,8.1,2846


In [15]:
the_office_columns.append('averageRating')
the_office_df = the_office_df[the_office_columns]
the_office_df.head()

Unnamed: 0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating
0,The Office,tt0386676,tt0664510,1,5,8.4
1,The Office,tt0386676,tt0664514,1,2,8.3
2,The Office,tt0386676,tt0664517,1,3,7.9
3,The Office,tt0386676,tt0664518,1,6,7.8
4,The Office,tt0386676,tt0664521,1,1,7.6


In [16]:
imdb_dfs['title.principals_df'].head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Herself""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [17]:
the_office_df = the_office_df = the_office_df.merge(imdb_dfs['title.principals_df'], 
                                                    left_on=the_office_df.tconst_episode,
                                                    right_on=imdb_dfs['title.principals_df'].tconst,
                                                    suffixes=('_episode', '_principals'), how='inner')
the_office_df.head()

Unnamed: 0,key_0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating,tconst,ordering,nconst,category,job,characters
0,tt0664510,The Office,tt0386676,tt0664510,1,5,8.4,tt0664510,10,nm0251848,cinematographer,director of photography,\N
1,tt0664510,The Office,tt0386676,tt0664510,1,5,8.4,tt0664510,1,nm0136797,actor,\N,"[""Michael Scott""]"
2,tt0664510,The Office,tt0386676,tt0664510,1,5,8.4,tt0664510,2,nm0933988,actor,\N,"[""Dwight Schrute""]"
3,tt0664510,The Office,tt0386676,tt0664510,1,5,8.4,tt0664510,3,nm1024677,actor,\N,"[""Jim Halpert""]"
4,tt0664510,The Office,tt0386676,tt0664510,1,5,8.4,tt0664510,4,nm0278979,actress,\N,"[""Pam Beesly""]"


In [18]:
the_office_df = the_office_df.sort_values(by=['episodeNumber', 'ordering'])
the_office_df.head()

Unnamed: 0,key_0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating,tconst,ordering,nconst,category,job,characters
41,tt0664521,The Office,tt0386676,tt0664521,1,1,7.6,tt0664521,1,nm0136797,actor,\N,"[""Michael Scott""]"
42,tt0664521,The Office,tt0386676,tt0664521,1,1,7.6,tt0664521,2,nm0933988,actor,\N,"[""Dwight Schrute""]"
43,tt0664521,The Office,tt0386676,tt0664521,1,1,7.6,tt0664521,3,nm1024677,actor,\N,"[""Jim Halpert""]"
44,tt0664521,The Office,tt0386676,tt0664521,1,1,7.6,tt0664521,4,nm0278979,actress,\N,"[""Pam Beesly""]"
45,tt0664521,The Office,tt0386676,tt0664521,1,1,7.6,tt0664521,5,nm0477129,director,\N,\N


In [19]:
new_columns = ['nconst', 'category', 'characters']
for column in new_columns:
    the_office_columns.append(column)

In [20]:
the_office_df = the_office_df.loc[the_office_df.category == 'actor', 
                                  the_office_columns].reset_index(drop=True)
the_office_df.head()

Unnamed: 0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating,nconst,category,characters
0,The Office,tt0386676,tt0664521,1,1,7.6,nm0136797,actor,"[""Michael Scott""]"
1,The Office,tt0386676,tt0664521,1,1,7.6,nm0933988,actor,"[""Dwight Schrute""]"
2,The Office,tt0386676,tt0664521,1,1,7.6,nm1024677,actor,"[""Jim Halpert""]"
3,The Office,tt0386676,tt0664514,1,2,8.3,nm0136797,actor,"[""Michael Scott""]"
4,The Office,tt0386676,tt0664514,1,2,8.3,nm0933988,actor,"[""Dwight Schrute""]"


In [21]:
for column in new_columns:
    del the_office_df[column]
    the_office_columns.remove(column)
the_office_df = the_office_df.drop_duplicates().reset_index(drop=True)
the_office_df.head()

Unnamed: 0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating
0,The Office,tt0386676,tt0664521,1,1,7.6
1,The Office,tt0386676,tt0664514,1,2,8.3
2,The Office,tt0386676,tt0664517,1,3,7.9
3,The Office,tt0386676,tt0664523,1,4,8.1
4,The Office,tt0386676,tt0664510,1,5,8.4


In [22]:
the_office_df = the_office_df.astype('object')
for ep_num, ep in the_office_df.tconst_episode.iteritems():
    
    x = ep_num + 1
    url = f'https://www.imdb.com/title/{ep}/?ref_=ttep_ep{x}'
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    
    cast_odd = soup.find_all('tr', class_='odd')
    cast_odd_list = []
    for cast_member in cast_odd:
        cast_odd_list.append(str(cast_member.find_all('a')[0].contents[0])
                             .split('title=')[1].split('\"')[1].replace('\n','').strip())

    cast_even = soup.find_all('tr', class_='even')
    cast_even_list = []
    for cast_member in cast_even:
        cast_even_list.append(str(cast_member.find_all('a')[0].contents[0])
                             .split('title=')[1].split('\"')[1].replace('\n','').strip())

    cast_list = list(itertools.chain
                           .from_iterable(zip(cast_odd_list, cast_even_list)))

    characters_odd = soup.find_all('tr', class_='odd')
    characters_odd_list = []
    for character in characters_odd:
        characters_odd_list.append(character.find('td', class_='character')
                                   .text.replace('\n','').strip())

    characters_even = soup.find_all('tr', class_='even')
    characters_even_list = []
    for character in characters_even:
        characters_even_list.append(character.find('td', class_='character')
                                    .text.replace('\n','').split('(')[0].strip())


    characters_list = list(itertools.chain
                               .from_iterable(zip(characters_odd_list, characters_even_list)))
    
    the_office_df.loc[ep_num, 'cast'] = 'a'
    the_office_df.loc[ep_num, 'characters'] = 'a'
    the_office_df.loc[ep_num, 'cast'] = cast_list
    the_office_df.loc[ep_num, 'characters'] = characters_list
    
the_office_df.head()

Unnamed: 0,primaryTitle,tconst_series,tconst_episode,seasonNumber,episodeNumber,averageRating,cast,characters
0,The Office,tt0386676,tt0664521,1,1,7.6,"[Steve Carell, Rainn Wilson, John Krasinski, J...","[Michael Scott, Dwight Schrute, Jim Halpert, P..."
1,The Office,tt0386676,tt0664514,1,2,8.3,"[Steve Carell, Rainn Wilson, John Krasinski, J...","[Michael Scott, Dwight Schrute, Jim Halpert, P..."
2,The Office,tt0386676,tt0664517,1,3,7.9,"[Steve Carell, Rainn Wilson, John Krasinski, J...","[Michael Scott, Dwight Schrute, Jim Halpert, P..."
3,The Office,tt0386676,tt0664523,1,4,8.1,"[Steve Carell, Rainn Wilson, John Krasinski, J...","[Michael Scott, Dwight Schrute, Jim Halpert, P..."
4,The Office,tt0386676,tt0664510,1,5,8.4,"[Steve Carell, Rainn Wilson, John Krasinski, J...","[Michael Scott, Dwight Schrute, Jim Halpert, P..."


In [23]:
the_office_df['characters'][2]

['Michael Scott',
 'Dwight Schrute',
 'Jim Halpert',
 'Pam Beesly',
 'Ryan Howard',
 'Jan Levinson-Gould',
 'Stanley Hudson',
 'Kevin Malone',
 'Meredith Palmer',
 'Travel Agent',
 'Kelly Kapoor',
 'Angela Martin',
 'Toby Flenderson',
 'Oscar Martinez']