In [1]:
# Colab helper module used to attach Google Drive to this runtime
from google.colab import drive
# Mount Drive at /content/drive; the data paths used later in this notebook live under it
drive.mount('/content/drive')

Mounted at /content/drive


### Read the dataset files and load into a dictionary of pandas dataframes.

In [2]:
import pandas as pd
import os

data_dir = '/content/drive/MyDrive/IMDB data/'

# Every gzipped TSV that sits directly inside the data directory
file_names = [entry for entry in os.listdir(data_dir) if entry.endswith('.tsv.gz')]

print(f"Found {len(file_names)} IMDB data files:\n{file_names}\n")

# Maps a dataset key (e.g. 'title_basics') to the DataFrame loaded from its file
imdb_dfs = {}

for file_name in file_names:
    print(f"Processing: {file_name}")

    try:
        # Tab-separated, gzip-compressed; low_memory=False parses each file in one pass
        df = pd.read_csv(
            os.path.join(data_dir, file_name),
            sep='\t',
            compression='gzip',
            low_memory=False,
        )
        # 'title.basics.tsv.gz' -> 'title_basics'
        imdb_dfs[file_name.replace('.tsv.gz', '').replace('.', '_')] = df

        print(f"\n--- {file_name} Head ---")
        print(df.head())
        print(f"\n--- {file_name} Info ---")
        df.info()

    except Exception as e:
        # A file that fails to load is reported and skipped, so its key
        # will be absent from imdb_dfs for the cells below.
        print(f"Error loading {file_name}: {e}")

print(f"Available DataFrames: {list(imdb_dfs.keys())}")

Found 7 IMDB data files:
['name.basics.tsv.gz', 'title.episode.tsv.gz', 'title.crew.tsv.gz', 'title.ratings.tsv.gz', 'title.basics.tsv.gz', 'title.akas.tsv.gz', 'title.principals.tsv.gz']

Processing: name.basics.tsv.gz

--- name.basics.tsv.gz Head ---
      nconst      primaryName birthYear deathYear  \
0  nm0000001     Fred Astaire      1899      1987   
1  nm0000002    Lauren Bacall      1924      2014   
2  nm0000003  Brigitte Bardot      1934        \N   
3  nm0000004     John Belushi      1949      1982   
4  nm0000005   Ingmar Bergman      1918      2007   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0072308,tt0050419,tt0027125,tt0025164  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0038355,tt0117057  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,tt0080455,tt0078723  
4               writer,

### Keep only titles of type `movie` that have more than 1,000 votes

In [3]:

title_df = imdb_dfs['title_basics']
title_ratings = imdb_dfs['title_ratings']

# Minimum number of votes (exclusive) a title needs to count as "popular"
MIN_VOTES = 1000

popular_titles = title_ratings[title_ratings['numVotes'] > MIN_VOTES]
movie_df = title_df[title_df['titleType'] == 'movie']
# Plain-text preview of the movie rows before the popularity filter is applied
print(movie_df.head())

# The inner merge on tconst keeps only movies that also appear in popular_titles;
# only the key column is taken from the ratings side, so no rating columns are added here.
movie_df_popular = pd.merge(movie_df, popular_titles[['tconst']], on='tconst', how='inner')
display(movie_df_popular.head())

        tconst titleType                   primaryTitle  \
8    tt0000009     movie                     Miss Jerry   
144  tt0000147     movie  The Corbett-Fitzsimmons Fight   
331  tt0000335     movie          Soldiers of the Cross   
498  tt0000502     movie                       Bohemios   
570  tt0000574     movie    The Story of the Kelly Gang   

                     originalTitle  isAdult startYear endYear runtimeMinutes  \
8                       Miss Jerry        0      1894      \N             45   
144  The Corbett-Fitzsimmons Fight        0      1897      \N            100   
331          Soldiers of the Cross        0      1900      \N             40   
498                       Bohemios        0      1905      \N            100   
570    The Story of the Kelly Gang        0      1906      \N             70   

                         genres  
8                       Romance  
144      Documentary,News,Sport  
331             Biography,Drama  
498                         

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
1,tt0002130,movie,Dante's Inferno,L'inferno,0,1911,\N,71,"Adventure,Drama,Fantasy"
2,tt0002423,movie,Passion,Madame DuBarry,0,1919,\N,113,"Biography,Drama,Romance"
3,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas I: À l'ombre de la guillotine,0,1913,\N,54,"Crime,Drama"
4,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,0,1913,\N,96,Drama


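### Filter further to include only movies that have an English-language title entry (`language == 'en'` in `title.akas`) and were released after 1970 (roughly the last 50 years).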

In [4]:
title_akas = imdb_dfs['title_akas']

# Alternate-title rows whose language is marked as English
english_titles = title_akas.loc[title_akas['language'].eq('en')]

# Keep the popular movies that have at least one such English row
has_english_aka = movie_df_popular['tconst'].isin(english_titles['titleId'])
movie_df_popular = movie_df_popular.loc[has_english_aka]

# r'\N' is the placeholder for a missing value in these columns; rows with a
# missing start year must go before the column can be cast to int.
has_start_year = movie_df_popular['startYear'].ne(r'\N')
movie2_df = movie_df_popular.loc[has_start_year].copy()
movie2_df['startYear'] = movie2_df['startYear'].astype(int)

# Require a known genre and a start year strictly after 1970
has_genres = movie2_df['genres'].ne(r'\N')
after_1970 = movie2_df['startYear'].gt(1970)
movie2_df = movie2_df.loc[has_genres & after_1970]
print(movie2_df.shape)

(37213, 9)


### Keep only the actor, actress and director credits (from `title.principals`) for the movies retained by the filters above.

In [5]:
title_principles = imdb_dfs['title_principals']
print(title_principles.head())

# Credits of interest: acting and directing roles attached to the filtered movies
is_cast_or_director = title_principles['category'].isin(['actor', 'actress', 'director'])
in_selected_movies = title_principles['tconst'].isin(movie2_df['tconst'])
actor_director_relation_df = title_principles[is_cast_or_director & in_selected_movies]
print(actor_director_relation_df.head(), actor_director_relation_df.shape)

# Attach the rating columns to the filtered movies (inner join on tconst)
movie_with_ratings_df = movie2_df.merge(title_ratings, on='tconst', how='inner')


      tconst  ordering     nconst         category                      job  \
0  tt0000001         1  nm1588970             self                       \N   
1  tt0000001         2  nm0005690         director                       \N   
2  tt0000001         3  nm0005690         producer                 producer   
3  tt0000001         4  nm0374658  cinematographer  director of photography   
4  tt0000002         1  nm0721526         director                       \N   

  characters  
0   ["Self"]  
1         \N  
2         \N  
3         \N  
4         \N  
           tconst  ordering     nconst category job         characters
418998  tt0035423         1  nm0000212  actress  \N     ["Kate McKay"]
418999  tt0035423         2  nm0413168    actor  \N        ["Leopold"]
419000  tt0035423         3  nm0000630    actor  \N  ["Stuart Besser"]
419001  tt0035423         4  nm0005227    actor  \N  ["Charlie McKay"]
419002  tt0035423         5  nm0005169  actress  \N          ["Darci"] (399849, 

In [6]:
# One region per title: drop_duplicates keeps the first English-language row found
# for each titleId, so the region is whichever one that row lists.
# NOTE(review): this is not necessarily the movie's country of origin - confirm this is intended.
region_map = english_titles[['titleId', 'region']].drop_duplicates(subset=['titleId'])

# Look the region up by tconst instead of merging. Merging into a frame that already
# has a `region` column (i.e. when this cell is re-run) would yield `region_x`/`region_y`;
# assigning through a lookup overwrites the column, so the cell is safe to re-run.
# titleId is unique in region_map, which is what makes it usable as a lookup index.
region_by_title = region_map.set_index('titleId')['region']
movie_with_ratings_df = movie_with_ratings_df.assign(
    region=movie_with_ratings_df['tconst'].map(region_by_title)
)

print("Shape of movie_with_ratings_df after adding region column:", movie_with_ratings_df.shape)

Shape of movie_with_ratings_df after adding region column: (37213, 12)


In [8]:
name_basics = imdb_dfs['name_basics']

# People who hold at least one of the retained actor/actress/director credits
credited_ids = actor_director_relation_df['nconst']
people_df = name_basics.loc[name_basics['nconst'].isin(credited_ids)]

print(people_df.shape)
display(people_df.head())

(163434, 6)


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


### Example filtered data point: look up the movie "Inception" in the dataset

In [9]:
# Movie row(s) whose primary title is exactly "Inception"
is_inception = movie_with_ratings_df['primaryTitle'].eq('Inception')
display(movie_with_ratings_df.loc[is_inception])

# Credits for tconst tt1375666, the id shown for Inception in the row above
is_inception_credit = actor_director_relation_df['tconst'].eq('tt1375666')
inception_actors = actor_director_relation_df.loc[is_inception_credit]

# Resolve the credited nconst ids to their name records
inception_people_mask = name_basics['nconst'].isin(inception_actors['nconst'])
display(name_basics.loc[inception_people_mask])

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,region
19367,tt1375666,movie,Inception,Inception,0,2010,\N,148,"Action,Adventure,Sci-Fi",8.8,2745910,ID


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
137,nm0000138,Leonardo DiCaprio,1974,\N,"producer,actor,writer","tt1375666,tt0407887,tt0120338,tt0993846"
296,nm0000297,Tom Berenger,1949,\N,"actor,producer,writer","tt0091763,tt1375666,tt0085244,tt0097815"
591,nm0000592,Pete Postlethwaite,1946,2011,"actor,producer,archive_footage","tt0114814,tt0840361,tt0119567,tt1375666"
173388,nm0182839,Marion Cotillard,1975,\N,"actress,producer,writer","tt1375666,tt2053425,tt2737050,tt0450188"
312725,nm0330687,Joseph Gordon-Levitt,1981,\N,"actor,producer,writer","tt1375666,tt1276104,tt2229499,tt1306980"
342831,nm0362766,Tom Hardy,1977,\N,"actor,producer,writer","tt1375666,tt1345836,tt2692904,tt1392190"
578773,nm0614165,Cillian Murphy,1976,\N,"actor,producer,writer","tt0411195,tt0289043,tt0468569,tt15398776"
597592,nm0634240,Christopher Nolan,1970,\N,"producer,writer,director","tt6723592,tt1375666,tt0816692,tt5013056"
641422,nm0680983,Elliot Page,1987,\N,"actor,producer,writer","tt0467406,tt1375666,tt0424136,tt1877832"
858827,nm0913822,Ken Watanabe,1959,\N,"actor,producer,miscellaneous","tt0325710,tt1375666,tt0831387,tt2109248"


### Filtered dataset

In [10]:
# Shape and first rows of each of the three filtered tables, in order:
# movies, credit rows, people
for frame in (movie_with_ratings_df, actor_director_relation_df, people_df):
    print(frame.shape)
    display(frame.head())

(37213, 12)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,region
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance",6.4,92761,HK
1,tt0054724,movie,I Eat Your Skin,Zombie,0,1971,\N,92,Horror,3.6,1792,IN
2,tt0061592,movie,Doomsday Machine,Doomsday Machine,0,1976,\N,83,Sci-Fi,2.6,1526,CA
3,tt0063142,movie,Isle of the Snake People,La muerte viviente,0,1971,\N,90,"Horror,Mystery",3.4,1135,CA
4,tt0064451,movie,A Touch of Zen,Xia nü,0,1971,\N,200,"Action,Adventure,Drama",7.5,7996,XWW


(399849, 6)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
418998,tt0035423,1,nm0000212,actress,\N,"[""Kate McKay""]"
418999,tt0035423,2,nm0413168,actor,\N,"[""Leopold""]"
419000,tt0035423,3,nm0000630,actor,\N,"[""Stuart Besser""]"
419001,tt0035423,4,nm0005227,actor,\N,"[""Charlie McKay""]"
419002,tt0035423,5,nm0005169,actress,\N,"[""Darci""]"


(163434, 6)


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


### Save Filtered data to CSV files

In [None]:
# Directory on the mounted Drive that receives the exported CSV files
output_dir = '/content/drive/MyDrive/IMDB_processed_data/'

# Create the directory if needed; exist_ok=True makes this a no-op when it already exists.
# `os` is imported in the loading cell near the top of the notebook.
os.makedirs(output_dir, exist_ok=True)

# Variable name -> (frame to export, target file name). Building each log line from
# the same path that is written keeps the message in step with the file on disk.
exports = {
    'movie_with_ratings_df': (movie_with_ratings_df, 'movies.csv'),
    'actor_director_relation_df': (actor_director_relation_df, 'actor_director_movies_edges.csv'),
    'people_df': (people_df, 'people.csv'),
}

for frame_name, (frame, csv_name) in exports.items():
    csv_path = os.path.join(output_dir, csv_name)
    # index=False: the row index is not written to the CSV
    frame.to_csv(csv_path, index=False)
    print(f"{frame_name} saved to {csv_path}")