In [1]:
#If needed, depending on where this code will be run, please run:

#pip install pandas
#pip insall numpy
#pip install matplotlib
#pip install seaborn
#pip install plotly


In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

# Loading the dataset and preliminary steps

In [3]:
movies = pd.read_csv('films.csv')
movies.head()

Unnamed: 0,rank,film_title,film_year,overall_rating,language,genre,mpaa_rating,director,actors,plot_summary
0,1,The Shawshank Redemption,1994,3.5,English,Drama,R,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","A young banker named Andy, has his life change..."
1,2,Fight Club,1999,3.3,English,Drama/Crime,R,David Fincher,"Edward Norton, Brad Pitt, Helena Bonham Carter...",An average thirty-something office worker gets...
2,3,The Godfather,1972,3.5,English,Drama/Crime,R,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Robert D...",An organized crime leader gives up control of ...
3,4,The Dark Knight,2008,3.4,English,Action/Adventure,PG-13,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Batman raises the stakes in his war on crime. ...
4,5,Pulp Fiction,1994,3.4,English,Crime/Drama,R,Quentin Tarantino,"John Travolta, Samuel L. Jackson, Bruce Willis...",A story of four interweaving stories developin...


## Checking data types

In [4]:
movies.dtypes

rank                int64
film_title         object
film_year           int64
overall_rating    float64
language           object
genre              object
mpaa_rating        object
director           object
actors             object
plot_summary       object
dtype: object

## Checking null values

In [5]:
movies.isnull().sum()

rank               0
film_title         0
film_year          0
overall_rating     0
language           1
genre              0
mpaa_rating        2
director           0
actors             4
plot_summary      37
dtype: int64

The null values have been spot checked and that information is actually missing from the website, indicating that there is no issue - as it seems - with the scraper itself. Those null values will be excluded when/if needed later on during the analysis

## Checking duplicate rows

In [6]:
movies.duplicated().sum()

np.int64(0)

## Formatting/cleaning of relevant features and general feature information

As multiple actors, directors or genres are together in the same column, it is convenient to use list comprehension so that insights can be explored more effectively (for example looking at actors that worked together, exploring different combinations of genres over time etc.). As actors have missing values, a condition will be included to return an empty list in that case

In [7]:
movies['genre'] = [x.split('/') for x in movies['genre']]
movies['director'] = [x.split(',' ) for x in movies['director']]
movies['actors'] = [x.split(',') if pd.notna(x) else [] for x in movies['actors']]

In [8]:
movies.head(20)

Unnamed: 0,rank,film_title,film_year,overall_rating,language,genre,mpaa_rating,director,actors,plot_summary
0,1,The Shawshank Redemption,1994,3.5,English,[Drama],R,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, W...","A young banker named Andy, has his life change..."
1,2,Fight Club,1999,3.3,English,"[Drama, Crime]",R,[David Fincher],"[Edward Norton, Brad Pitt, Helena Bonham Car...",An average thirty-something office worker gets...
2,3,The Godfather,1972,3.5,English,"[Drama, Crime]",R,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Robe...",An organized crime leader gives up control of ...
3,4,The Dark Knight,2008,3.4,English,"[Action, Adventure]",PG-13,[Christopher Nolan],"[Christian Bale, Heath Ledger, Aaron Eckhart...",Batman raises the stakes in his war on crime. ...
4,5,Pulp Fiction,1994,3.4,English,"[Crime, Drama]",R,[Quentin Tarantino],"[John Travolta, Samuel L. Jackson, Bruce Wil...",A story of four interweaving stories developin...
5,6,Forrest Gump,1994,3.3,English,"[Drama, Comedy]",PG-13,[Robert Zemeckis],"[Tom Hanks, Robin Wright Penn, Gary Sinise, ...","A ""slow"" man explains to people he meets on a ..."
6,7,Lord of the Rings: The Fellowship of the Ring,2001,3.3,English,"[Adventure, Fantasy]",PG-13,[Peter Jackson],"[Elijah Wood, Ian McKellen, Viggo Mortensen,...",A Hobbit by the name of Frodo is entrusted wit...
7,8,Lord of the Rings: The Return of the King,2003,3.4,English,"[Adventure, Fantasy]",PG-13,[Peter Jackson],"[Elijah Wood, Viggo Mortensen, Ian McKellen,...","Frodo and Sam try to destroy ""the one ring"" wh..."
8,9,The Matrix,1999,3.2,English,"[Sci-Fi, Action]",R,"[Lana Wachowski, Lilly Wachowski]","[Keanu Reeves, Carrie-Anne Moss, Laurence Fi...","A computer programmer, Neo, finds that things ..."
9,10,Schindler's List,1993,3.4,English,"[Drama, Biography]",R,[Steven Spielberg],"[Liam Neeson, Ben Kingsley, Ralph Fiennes, ...",Schindler (played by Liam Neeson) tries to sav...


We will now check how many unique actors, directors, films 

In [9]:
from collections import Counter

flattened_actors = movies.explode('actors')
unique_actors = flattened_actors['actors'].dropna().drop_duplicates()
flattened_directors = movies.explode('director')
unique_directors = flattened_directors['director'].dropna().drop_duplicates()
n_films = movies['film_title'].nunique()
print(f"Number of films: {n_films}")
print(f"Number of unique directors: {len(unique_directors)}")
print(f"Number of unique actors: {len(unique_actors)}")






Number of films: 999
Number of unique directors: 599
Number of unique actors: 4096


1000 films have been scraped but 999 are returned as unique. This will now be checked to ensure there are no errors or inconsistencies

In [10]:
duplicate_titles = movies['film_title'].value_counts()
duplicate_titles = duplicate_titles[duplicate_titles > 1]
print(duplicate_titles)


film_title
True Grit    2
Name: count, dtype: int64


In [11]:
duplicate_titles_df = movies[movies['film_title'].isin(duplicate_titles.index)]
duplicate_titles_df

Unnamed: 0,rank,film_title,film_year,overall_rating,language,genre,mpaa_rating,director,actors,plot_summary
298,299,True Grit,2010,3.1,English,"[Drama, Western]",PG-13,"[Ethan Coen, Joel Coen]","[Matt Damon, Jeff Bridges, Josh Brolin, Bar...",The Coen Brothers' take on the classic western...
672,673,True Grit,1969,3.0,English,"[Adventure, Western]",G,[Henry Hathaway],"[John Wayne, Glen Campbell, Kim Darby, Robe...",A grizzled and often-times drunk U.S. Marshal ...


It seems like True Grit has had two productions within the timespan of approximately 40 years with different directors and actors. There was no error in the scraping and this is the only duplicate film title.
Unique mpaa ratings will now be checked.

In [12]:
movies.mpaa_rating.value_counts()

mpaa_rating
R        413
PG       200
PG-13    192
NR       148
G         44
NC-17      1
Name: count, dtype: int64

For the purpose of this analysis, an additional feature can be engineered with the meaning of the mpaa_rating as per [Wikipedia](https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system). The category will be assigned to a dictionary with the corresponding mpaa_code to then engineer another categorical column.


In [13]:
mpaa_ratings_dict = {
    'G': 'General Audience',
    'PG': 'Parental Guidance Suggested',
    'PG-13': 'Parents Strongly Cautioned',
    'R': 'Restricted',
    'NC-17': 'Adults Only'
}

In [14]:
movies['mpaaa_cat'] = movies['mpaa_rating'].map(mpaa_ratings_dict)
movies.head()


Unnamed: 0,rank,film_title,film_year,overall_rating,language,genre,mpaa_rating,director,actors,plot_summary,mpaaa_cat
0,1,The Shawshank Redemption,1994,3.5,English,[Drama],R,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, W...","A young banker named Andy, has his life change...",Restricted
1,2,Fight Club,1999,3.3,English,"[Drama, Crime]",R,[David Fincher],"[Edward Norton, Brad Pitt, Helena Bonham Car...",An average thirty-something office worker gets...,Restricted
2,3,The Godfather,1972,3.5,English,"[Drama, Crime]",R,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Robe...",An organized crime leader gives up control of ...,Restricted
3,4,The Dark Knight,2008,3.4,English,"[Action, Adventure]",PG-13,[Christopher Nolan],"[Christian Bale, Heath Ledger, Aaron Eckhart...",Batman raises the stakes in his war on crime. ...,Parents Strongly Cautioned
4,5,Pulp Fiction,1994,3.4,English,"[Crime, Drama]",R,[Quentin Tarantino],"[John Travolta, Samuel L. Jackson, Bruce Wil...",A story of four interweaving stories developin...,Restricted


It is possible to engineer an additional feature to compute how old is  the film, as film_year is available.

In [15]:
movies['film_age'] = 2025 - movies['film_year']

In [16]:
movies.head()

Unnamed: 0,rank,film_title,film_year,overall_rating,language,genre,mpaa_rating,director,actors,plot_summary,mpaaa_cat,film_age
0,1,The Shawshank Redemption,1994,3.5,English,[Drama],R,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, W...","A young banker named Andy, has his life change...",Restricted,31
1,2,Fight Club,1999,3.3,English,"[Drama, Crime]",R,[David Fincher],"[Edward Norton, Brad Pitt, Helena Bonham Car...",An average thirty-something office worker gets...,Restricted,26
2,3,The Godfather,1972,3.5,English,"[Drama, Crime]",R,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Robe...",An organized crime leader gives up control of ...,Restricted,53
3,4,The Dark Knight,2008,3.4,English,"[Action, Adventure]",PG-13,[Christopher Nolan],"[Christian Bale, Heath Ledger, Aaron Eckhart...",Batman raises the stakes in his war on crime. ...,Parents Strongly Cautioned,17
4,5,Pulp Fiction,1994,3.4,English,"[Crime, Drama]",R,[Quentin Tarantino],"[John Travolta, Samuel L. Jackson, Bruce Wil...",A story of four interweaving stories developin...,Restricted,31


The dataset is now better suited for analysis. Some insights will now be gathered with regards to the numerical columns (film_year, film_age and overall_rating). 

In [17]:
numerical_columns = ['film_year', 'film_age', 'overall_rating']
movies[numerical_columns].describe()


Unnamed: 0,film_year,film_age,overall_rating
count,1000.0,1000.0,1000.0
mean,1991.565,33.435,3.1178
std,24.026924,24.026924,0.130077
min,1902.0,1.0,2.8
25%,1975.0,14.0,3.0
50%,1999.0,26.0,3.1
75%,2011.0,50.0,3.2
max,2024.0,123.0,3.6


In [18]:
for col in numerical_columns:
    fig = px.box(movies, y = col, points='all')
    fig.show()
