In [27]:
# Libraries
import pandas as pd 
import numpy as np
pd.options.mode.chained_assignment = None

In [28]:
# - 1.1. Download the data set movie_metadata.csv, which contains data about films from IMDb
# (Internet Movie Database).
df = pd.read_csv("../files/movie_metadata.csv")

#### Exploring the dataframe 

In [29]:
df.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0$,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0$,2007.0,5000.0,7.1,2.35,0


In [30]:
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

#### Main points of the task

Note: Each cell above has the point to resolve

In [31]:
# 1.2. The duration column contains data on the film length. How many missing values are there  in this column?
na_duration = df['duration'].isna().sum()
print(f"There is {na_duration} missing values on the column duration")

There is 15 missing values on the column duration


In [32]:
# 1.3. Replace the missing values in the duration column with the median value for this column.
df['duration'].fillna((df['duration'].mean()), inplace=True)

In [33]:
# 1.4. What is the average film length? Give the answer as a floating-point figure rounded to two decimal places.
average_duration = df['duration'].mean().round(2)
print(f"The average duration of the films is {average_duration}")

The average duration of the films is 107.2


In [34]:
# 1.5. Create a movie_duration_category column, which will contain three categories
# depending on the film length:
#   • Category "1. <90" if the film is less than 90 minutes long
#   • Category "2. 90–120" if the film is between 90 minutes and two hours long (inclusively)
#   • Category "3. >120" if the film is more than two hours long

def map_duration_category(x): 
    """
    Map the categories in each bucket of duration
    """
    if x < 90: 
        return '<90'
    elif 90 <= x <= 120: 
        return '90–120'
    else: 
        return '>120'
    
df['movie_duration_category'] = df['duration'].map(map_duration_category)


In [35]:
# 1.6. Build a summary table for films released after 2000 (inclusively), to list the numbers of
# films:
#   • Table rows: year
#   • Table columns: movie duration category ("<90", "90–120", ">120")
#   • The year of release should be displayed in the YYYY format.

recent_films = df[df['title_year'] >= 2000]
recent_films['release_year'] = recent_films['title_year'].astype(int).astype(str)
table_resume = recent_films[['release_year', 'movie_duration_category']].pivot_table(index='release_year', columns=['movie_duration_category'], aggfunc=lambda x: len(x))
table_resume.head(10)

movie_duration_category,90–120,<90,>120
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,112,25,34
2001,120,29,39
2002,146,36,27
2003,108,31,30
2004,142,30,42
2005,142,31,48
2006,146,40,53
2007,130,31,43
2008,160,29,36
2009,178,42,40


In [36]:
# 1.7. How many films between 90 minutes and two hours long were released in 2008?
condition = (df['title_year'] == 2008) & (df['duration'].between(90, 120))
print(f"There are {df[condition].shape[0]} films between 90 minutes and two hours long that were released in 2008")

There are 160 films between 90 minutes and two hours long that were released in 2008


In [37]:
df['plot_keywords'].sample(2)

4273    fellatio|seven word title|shoe salesman|title ...
1575               africa|hunter|love|marriage|plantation
Name: plot_keywords, dtype: object

In [38]:
# 1.8. The plot_keywords column holds keywords characterizing the film's plot. Using the data
# in this column, create a column called movie_plot_category, to contain four categories
#   depending on the key words in the column:
#    • Category "love_and_death" if the keywords include both "love" and "death"
#    • Category "love" if the keywords include the word "love"
#    • Category "death" if the keywords include the word "death"
#    • Category "other" if the keywords do not meet the conditions above

def movie_plot_category(x): 
    if x is None or str(x) == 'nan': 
        return 'other'
    elif 'love' in x and 'death' in x: 
        return 'love_and_death'
    elif 'love' in x: 
        return 'love'
    elif 'death' in x: 
        return 'death'
    else: 
        return 'other'

# Making sure all is in lower case 
df['plot_keywords'] = df['plot_keywords'].str.lower()
df['movie_plot_category'] = df['plot_keywords'].map(movie_plot_category)


In [39]:
# Check general movie_plot_category
df['movie_plot_category'].value_counts()

other             4621
love               234
death              172
love_and_death      16
Name: movie_plot_category, dtype: int64

In [40]:
# Check general movie_plot_category
print("Love category example: ", df[df['movie_plot_category']=='love']['plot_keywords'].iloc[0])
print("Love & death category example: ", df[df['movie_plot_category']=='love_and_death']['plot_keywords'].iloc[-1])
print("other category example: ", df[df['movie_plot_category']=='other']['plot_keywords'].iloc[0])

Love category example:  blood|book|love|potion|professor
Love & death category example:  coroner|death|gay son|love|mistaken identity
other category example:  avatar|future|marine|native|paraplegic


In [41]:
# 1.9. The imdb_score column shows a viewer rating for the film. Build a table to reflect the
# average rating of films depending on which movie_plot_category category they belong to.
#print(df[['imdb_score', 'movie_plot_category']].sample(3))

films_average_rating = df.groupby('movie_plot_category').imdb_score.mean().reset_index()
films_average_rating

Unnamed: 0,movie_plot_category,imdb_score
0,death,6.535465
1,love,6.580769
2,love_and_death,6.50625
3,other,6.431422


In [42]:
# 1.10. What is the average rating of films in the "love" category? Give the answer as a floatingpoint
# figure rounded to two decimal places.
map_category = films_average_rating['movie_plot_category'] == 'love'
print(f"The average rating films of love is {films_average_rating[map_category].iloc[0].imdb_score:.{2}f}")

The average rating films of love is 6.58


In [43]:
# 1.11. The budget column contains the film's budget. What is the median budget for all the films
# listed? Give the answer as an integer.

df['clean_budget'] = df['budget'].map(lambda x: float(x.replace('$', '')))
print(f"The median budget for all films is {int(df['clean_budget'].median())} $")

The median budget for all films is 15000000 $
