***File Info***

_bom.movie_gross.csv_
- (3387 x 5), [title, studio, domestic_gross, foreign_gross, year]

_imdb.name.basics.csv_
- (606648 x 6), [nconst, primary_name, birth_year, death_year, primary_profession, known_for_titles]

_imdb.title.akas.csv_
- (331703 x 8), [title_id, ordering, title, region, language, types, attributes, is_original_title]

_imdb.title.basics.csv_
- (146144 x 6), [tconst, primary_title, original_title, start_year, runtime_minutes, genres]

_imdb.title.principals.csv_
- (1028186 x 6), [tconst, ordering, nconst, category, job, characters]

_rt.movie_info.tsv_
- (1560 x 12), [id, synopsis, rating, genre, director, writer, theater_date, dvd_date, currency, box_office, runtime, studio]

_rt.reviews.tsv_
- (54432 x 8), [id, review, rating, fresh, critic, top_critic, publisher, date]

_tmdb.movies.csv_
- (26517 x 9), [genre_ids, id, original_language, original_title, popularity, release_date, title, vote_average, vote_count]

_tn.movie_budgets.csv_
- (5782 x 6), [id, release_date, movie, production_budget, domestic_gross, worldwide_gross]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline

***Cleaning bom.movie_gross.csv***

In [3]:
#reading bom.movie_gross.csv from disk and wrapping the result in a dataframe
data = pd.read_csv("./unzippedData/bom.movie_gross.csv")
bom_movie_gross = pd.DataFrame(data=data)
bom_movie_gross

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [4]:
#counting the missing values in each column
bom_movie_gross.isna().sum()

title                0
studio               5
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64

In [5]:
#printing each column's dtype and non-null count to see which columns need type conversion
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [6]:
#creating function to convert values in foreign_gross to floats
def convert_to_int(x):
    """Convert one foreign_gross cell to a float.

    The name is kept for existing callers, but the result is always a float
    (never an int) so that missing values can stay as NaN.

    Parameters
    ----------
    x : str, float, int or None
        A raw cell value. Strings may contain thousands separators
        (e.g. '1,131.0').

    Returns
    -------
    float
        The numeric value, or NaN when the cell is missing (None, NaN,
        an empty string, or the text 'nan').

    Raises
    ------
    ValueError
        If a non-empty string cannot be parsed as a number.
    """
    if x is None:
        return float('nan')
    if isinstance(x, str):
        cleaned = x.replace(',', '').strip()
        if cleaned == '' or cleaned.lower() == 'nan':
            return float('nan')
        return float(cleaned)
    # Already numeric (including NaN): pass it through as a float so that
    # applying this function to an already-converted column does not crash.
    return float(x)

In [7]:
#running convert_to_int over every foreign_gross entry and storing the result back in the column
bom_movie_gross['foreign_gross'] = bom_movie_gross['foreign_gross'].map(convert_to_int)

In [8]:
#replacing all NaN values with the median of the column
#the filled column is assigned back to the dataframe instead of calling
#fillna(inplace=True) on the attribute-accessed Series: that chained in-place
#form warns in pandas 2.x and leaves the dataframe unchanged under copy-on-write
median = bom_movie_gross['foreign_gross'].median()
bom_movie_gross['foreign_gross'] = bom_movie_gross['foreign_gross'].fillna(median)
bom_movie_gross.isna().sum()

title              0
studio             5
domestic_gross    28
foreign_gross      0
year               0
dtype: int64

In [9]:
#removing every row whose domestic_gross is missing
to_drop = bom_movie_gross.loc[bom_movie_gross['domestic_gross'].isnull()].index
bom_movie_gross.drop(index=to_drop, inplace=True)
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3359 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3359 non-null   object 
 1   studio          3356 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   3359 non-null   float64
 4   year            3359 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 157.5+ KB


***Cleaning imdb.name.basics***

In [10]:
#reading imdb.name.basics.csv into a dataframe and previewing the first rows
data = pd.read_csv("./unzippedData/imdb.name.basics.csv")
imdb_name_basics = pd.DataFrame(data=data)
imdb_name_basics.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [11]:
imdb_name_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   nconst              606648 non-null  object 
 1   primary_name        606648 non-null  object 
 2   birth_year          82736 non-null   float64
 3   death_year          6783 non-null    float64
 4   primary_profession  555308 non-null  object 
 5   known_for_titles    576444 non-null  object 
dtypes: float64(2), object(4)
memory usage: 27.8+ MB


In [112]:
#removing death_year column
#errors='ignore' keeps this cell safe to re-run: without it, a second execution
#raises KeyError because the column has already been removed
imdb_name_basics.drop(columns=['death_year'], errors='ignore', inplace=True)
imdb_name_basics.head()

Unnamed: 0,nconst,primary_name,birth_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


***Cleaning imdb.title.akas***

In [12]:
#reading imdb.title.akas.csv into a dataframe and previewing the first 50 rows
data = pd.read_csv("./unzippedData/imdb.title.akas.csv")
imdb_title_akas = pd.DataFrame(data=data)
imdb_title_akas.head(n=50)

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
5,tt0369610,15,Jurassic World,GR,,imdbDisplay,,0.0
6,tt0369610,16,Jurassic World,IT,,imdbDisplay,,0.0
7,tt0369610,17,Jurski svijet,HR,,imdbDisplay,,0.0
8,tt0369610,18,Olam ha'Yura,IL,he,imdbDisplay,,0.0
9,tt0369610,19,Jurassic World: Mundo Jurásico,MX,,imdbDisplay,,0.0


In [6]:
#reading imdb.title.basics.csv into a dataframe and previewing the first rows
data = pd.read_csv("./unzippedData/imdb.title.basics.csv")
imdb_title_basics = pd.DataFrame(data=data)
imdb_title_basics.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [11]:
#reading imdb.title.principals.csv into a dataframe and displaying it
#NOTE(review): the variable is spelled "principles" while the file is "principals";
#the name is left unchanged here because later cells may reference it
data = pd.read_csv("./unzippedData/imdb.title.principals.csv")
imdb_title_principles = pd.DataFrame(data=data)
imdb_title_principles

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"
...,...,...,...,...,...,...
1028181,tt9692684,1,nm0186469,actor,,"[""Ebenezer Scrooge""]"
1028182,tt9692684,2,nm4929530,self,,"[""Herself"",""Regan""]"
1028183,tt9692684,3,nm10441594,director,,
1028184,tt9692684,4,nm6009913,writer,writer,


In [20]:
#reading the tab-separated rt.movie_info.tsv into a dataframe (read_table splits on tabs by default)
data = pd.read_table("./unzippedData/rt.movie_info.tsv")
rt_movie_info = pd.DataFrame(data=data)
rt_movie_info.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [24]:
#reading the tab-separated rt.reviews.tsv with the python parser engine and previewing the first rows
data = pd.read_table("./unzippedData/rt.reviews.tsv", engine="python")
rt_reviews = pd.DataFrame(data=data)
rt_reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [54]:
#reading tmdb.movies.csv with the python parser engine, then discarding the 'Unnamed: 0' column
data = pd.read_csv("./unzippedData/tmdb.movies.csv", engine="python")
tmdb_movies = pd.DataFrame(data=data)
tmdb_movies.drop(columns=["Unnamed: 0"], inplace=True)
tmdb_movies.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [34]:
#reading tn.movie_budgets.csv into a dataframe and previewing the first rows
data = pd.read_csv("./unzippedData/tn.movie_budgets.csv")
tn_movie_budgets = pd.DataFrame(data=data)
tn_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
