In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Import the movies dataset
movies_df = pd.read_csv("data/movies_metadata.csv", low_memory=False)

In [3]:
# Over 45k movies
len(movies_df)

45466

In [4]:
# Inspect the movies columns
# Belongs to collection contains dictionaries
# Genres and spoken languages contain lists of dictionaries, each with a name field that will need to be extracted
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# Some fields have unexpected types - change budget to float and consider changing release_date to a datetime
movies_df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [6]:
# Cast certain columns to strings to enable filtering later
movies_df[['adult', 'original_language', 'original_title', 'overview',
          'status', 'tagline', 'video']] = movies_df[['adult', 'original_language', 'original_title', 'overview',
          'status', 'tagline', 'video']].astype(str)

In [7]:
# Some columns contain missing values that will need to be addressed
# The number is greater than shown, due to empty lists within some fields
movies_df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language            0
original_title               0
overview                     0
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                       0
tagline                      0
title                        6
video                        0
vote_average                 6
vote_count                   6
dtype: int64

In [8]:
# There are few adult movies in the dataset, as well as 2 movies with descriptions in this field; drop these rows
movies_df['adult'].value_counts()

False                                                                                                                             45454
True                                                                                                                                  9
 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.        1
 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.                        1
 - Written by Ørnås                                                                                                                   1
Name: adult, dtype: int64

In [9]:
# The majority of movies don't belong to a collection - suggest one/hot encoding to indicate yes/no to in collection
print(sum(~movies_df['belongs_to_collection'].isna()))
print(sum(movies_df['belongs_to_collection'].isna()))

4494
40972


In [10]:
# The majority of movies are in English, so let's consider narrowing the dataset to these
movies_df.original_language.value_counts()

en      32269
fr       2438
it       1529
ja       1350
de       1080
        ...  
lb          1
eo          1
fy          1
cy          1
82.0        1
Name: original_language, Length: 93, dtype: int64

In [11]:
len(movies_df[movies_df.vote_count >= 100])

6055

In [12]:
# Non-English movies 
# 25% of movies have 3 or less votes
print(movies_df[movies_df.original_language != 'en'].vote_count.quantile(.25))

# Half of movies have 10 or fewer votes
print(movies_df[movies_df.original_language != 'en'].vote_count.quantile(.5))

# 25% of movies have 23 or more votes
print(movies_df[movies_df.original_language != 'en'].vote_count.quantile(.75))

# Only 10% of movies have >=63 votes
print(movies_df[movies_df.original_language != 'en'].vote_count.quantile(.9))

3.0
9.0
23.0
63.0


In [13]:
# English movies 
# 25% of movies have 3 or less votes
print(movies_df[movies_df.original_language == 'en'].vote_count.quantile(.25))

# Half of movies have 10 or fewer votes
print(movies_df[movies_df.original_language == 'en'].vote_count.quantile(.5))

# 25% of English films have 43 or more votes
print(movies_df[movies_df.original_language == 'en'].vote_count.quantile(.75))

# Only 25% of movies have >34 votes
print(movies_df[movies_df.original_language == 'en'].vote_count.quantile(.9))

3.0
10.0
43.0
241.0


In [14]:
# Replace NaN values in collection column with 0s
# For movies with collections, replace with 1s
movies_df.belongs_to_collection.where(movies_df.belongs_to_collection.isnull(), 1, inplace=True)
movies_df.belongs_to_collection.fillna(0, inplace=True)

In [15]:
# Extract the name field from genres - will now contain lists of values or empty lists for NaN
movies_df['genres'] = [list(set([y['name'] for y in x])) for x in movies_df['genres'].apply(ast.literal_eval)]

In [16]:
# The majority of movies have been released, filter the dataset for these titles
movies_df['status'].value_counts()

Released           45014
Rumored              230
Post Production       98
nan                   87
In Production         20
Planned               15
Canceled               2
Name: status, dtype: int64

In [17]:
# Filter the dataframe for movies with adult status of False, an original language of English, released, and video False
filtered_movies = movies_df[(movies_df.adult == 'False') & (movies_df.original_language == 'en') 
                            & (movies_df.status == 'Released') & (movies_df.video == 'False')]

In [18]:
# Select only the necessary columns for the analysis
final_movies = filtered_movies[['id', 'belongs_to_collection', 'budget', 'genres',
                               'original_title', 'overview', 'production_companies', 'production_companies',
                                'release_date', 'revenue', 'runtime', 'spoken_languages', 
                                'tagline', 'title', 'vote_average', 'vote_count']].copy()

In [19]:
final_movies.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,original_title,overview,production_companies,production_companies.1,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count
0,862,1,30000000,"[Family, Animation, Comedy]",Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'name': 'Pixar Animation Studios', 'id': 3}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0
1,8844,0,65000000,"[Adventure, Fantasy, Family]",Jumanji,When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'name': 'TriStar Pictures', 'id': 559}, {'na...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,15602,1,0,"[Romance, Comedy]",Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,31357,0,16000000,"[Drama, Romance, Comedy]",Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",[{'name': 'Twentieth Century Fox Film Corporat...,[{'name': 'Twentieth Century Fox Film Corporat...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,11862,1,0,[Comedy],Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'name': 'Sandollar Productions', 'id': 5842}...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


In [20]:
# Now extract the spoken languages field
final_movies['spoken_languages'] = [list(set([y['name'] for y in x])) for x in final_movies['spoken_languages'].apply(ast.literal_eval)]

In [21]:
# Filter out movies without a release date or a runtime
final_movies = final_movies[(final_movies.release_date.notnull()) & (final_movies.runtime.notnull())]

In [22]:
# Convert the release date column to a datetime field
final_movies['release_date'] = pd.to_datetime(final_movies['release_date'])

In [23]:
# The number of movies released is much higher in some years than others
pd.to_datetime(final_movies['release_date']).dt.year.value_counts()[:25]

2014    1369
2013    1314
2015    1257
2012    1188
2011    1111
2009    1110
2016    1066
2008    1015
2010     984
2007     930
2006     864
2005     766
2002     638
2004     634
2003     604
2001     602
2000     552
1998     540
1999     532
1997     488
1996     476
1995     448
1994     420
2017     388
1993     371
Name: release_date, dtype: int64

In [24]:
final_movies.dtypes

id                               object
belongs_to_collection             int64
budget                           object
genres                           object
original_title                   object
overview                         object
production_companies             object
production_companies             object
release_date             datetime64[ns]
revenue                         float64
runtime                         float64
spoken_languages                 object
tagline                          object
title                            object
vote_average                    float64
vote_count                      float64
dtype: object

In [25]:
# Extract the release decade of each film
final_movies['release_decade'] = (final_movies.release_date.dt.year//10)*10

In [26]:
# Consider narrowing the timeframe of releaase dates to be evaluated
# The first movie ever made was made in 1888, so 1870 is an error
print(final_movies.release_decade.value_counts())

# Find the vote count for movies by decade
print('Below are the total vote counts by the movie\'s release decade - votes are greater for newer films')
final_movies.groupby(['release_decade'])['vote_count'].sum()

2010    8677
2000    7715
1990    4249
1980    2820
1970    2128
1950    1615
1960    1567
1940    1340
1930    1146
1920     328
1910     136
1900      52
1890      34
1870       1
1880       1
Name: release_decade, dtype: int64
Below are the total vote counts by the movie's release decade - votes are greater for newer films


release_decade
1870         25.0
1880          7.0
1890        220.0
1900        404.0
1910       1121.0
1920       3918.0
1930      18394.0
1940      29413.0
1950      45510.0
1960      60284.0
1970     120382.0
1980     268403.0
1990     597047.0
2000    1383720.0
2010    2034891.0
Name: vote_count, dtype: float64