In this exercise, we will load a large dataset (film metadata, modified from https://www.kaggle.com/rounakbanik/the-movies-dataset#movies_metadata.csv), extract some parts of it, perform some analysis, and plot the results.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import webbrowser

In [None]:
# Load the CSV file and have a look at its content.
# The path is relative, so 'movies_dataset.csv' must be in the working directory.
df = pd.read_csv('movies_dataset.csv')

# df.head()
# Show a random sample of 10 rows from this dataset (no seed is set, so the
# selection differs from run to run):
df.sample(10)

In [None]:
# Explore a few individual cells of the table.
# Each value is fetched with a single label-based .loc[row, column] lookup
# (the same form used for the IMDb id elsewhere in this notebook) instead of
# chained indexing, which first materialises a whole row or column.
print(df.loc[0, 'overview'])
print(df.loc[5436, 'original_title'])
print(df.loc[25539, 'production_companies'])

In [None]:
# Select every row whose title matches the given string exactly:
df.loc[df['title'] == '12 Angry Men']

In [None]:
# Open the IMDb page of one film in a new browser tab. The page address is
# the fixed IMDb title prefix followed by the imdb_id stored in row 1161.
imdb_id = df.loc[1161, 'imdb_id']
webbrowser.open_new_tab('https://www.imdb.com/title/' + imdb_id)

In [None]:
# Drop all the columns we don't need:
# df2 = df.drop(columns=['budget', 'imdb_id', 'original_title', 'overview', 'production_companies', 'revenue', 'title', 'vote_count'])

In [None]:
# Now let's do some analysis with this dataset.
# Ensure that the columns containing release dates and runtimes are formatted consistently.
# Remove rows where release_date, runtime, title or original_language are missing.
df.dropna(subset=['release_date', 'runtime', 'title', 'original_language'], inplace=True)

# Delete rows whose release_date is too short (fewer than 8 characters) or
# does not contain '/'. regex=False makes the '/' a plain substring test
# rather than a regular expression.
drop_condition = (
    (df['release_date'].str.len() < 8)
    | ~df['release_date'].str.contains('/', regex=False)
)
df.drop(df[drop_condition].index, inplace=True)

# Keep only the year (the last four characters) as an integer.
# Vectorised string slicing replaces the per-row Python lambda; an entry whose
# last four characters are not a number still raises a ValueError.
df['release_date'] = df['release_date'].str[-4:].astype('int64')

In [None]:
# Display the DataFrame in its current state to inspect the cleaning so far:
df

In [None]:
# Now, let's look at the durations (label 'runtime'). They should be ints and larger than 0.
# Drop every row with a runtime below 1 first, then cast the column.
df.drop(df[df['runtime'] < 1].index, inplace=True)
# Vectorised cast instead of a per-row int() call; any fractional part is
# truncated, exactly as int() would do.
df['runtime'] = df['runtime'].astype('int64')

# Also drop rows with empty titles:
df.drop(df[df['title'].str.len() == 0].index, inplace=True)

# Language codes must be exactly two characters:
df.drop(df[df['original_language'].str.len() != 2].index, inplace=True)

In [None]:
# Give two columns names that describe their cleaned content
# (old name -> new name), then show the resulting table:
new_names = dict(release_date='year', runtime='duration')
df.rename(columns=new_names, inplace=True)
df

In [None]:
# Pull the two columns of interest out as stand-alone Series
# and report the shape of each one.
durations, years = df['duration'], df['year']
print(durations.shape)
print(years.shape)
# The last expression of the cell is displayed: the type of a single column.
type(years)

In [None]:
# Summary statistics of the durations: mean, median and maximum.
for label, value in (
    ('Mean: ', durations.mean()),
    ('Median: ', durations.median()),
    ('Longest duration: ', durations.max()),
):
    print(label, value)

In [None]:
# Quick look at the raw data: one point per film, year against duration.
plt.scatter(x=years, y=durations)

In [None]:
# A scatter plot of every single film is hard to read.
# Instead, collect the durations of each year and average them:
durations_by_year = df.groupby('year')['duration']
avg_durations_vs_years = durations_by_year.mean()

In [None]:
# Matplotlib can plot a pandas Series directly: the index (here the year)
# is used for the x axis and the values (mean duration) for the y axis.
plt.plot(avg_durations_vs_years)

# And this syntax also works:
# avg_durations_vs_years.plot()

In [None]:
# Restrict the data to a single language: keep only the rows whose
# original_language code is 'hi', then display that subset.
is_hindi = df['original_language'] == 'hi'
df_hindi_subset = df[is_hindi]
df_hindi_subset

In [None]:
# Yearly mean duration for the Hindi-only subset:
avg_durations_vs_years_hi = df_hindi_subset.groupby('year')['duration'].mean()

# Draw both curves (all movies first, then Hindi-only) into the same axes:
curves = (
    (avg_durations_vs_years, 'all movies'),
    (avg_durations_vs_years_hi, 'Hindi language'),
)
for series, label in curves:
    plt.plot(series, label=label)

# Annotate the figure.
plt.legend(loc='lower right')
plt.title('Variation of movie durations over time')
plt.xlabel('year')
plt.ylabel('duration in min (averaged per year)')

In [None]:
# What about the lengths of movie titles over the years?
# Create a new column containing the length of each original title.
# .str.len() is vectorised and yields NaN for a missing original_title
# (only 'title' was checked for missing values during cleaning), whereas
# calling len() on such a value would raise a TypeError.
df['title_length'] = df['original_title'].str.len()
df


In [None]:
# Mean title length for each release year:
films_by_year = df.groupby('year')
titlelength_vs_years = films_by_year['title_length'].mean()
# The values can be pulled out as a NumPy array for further processing:
title_lengths = titlelength_vs_years.to_numpy()
# ...

In [None]:
# Plot the mean title length per year using the Series' own plot method:
titlelength_vs_years.plot()