# Pandas data wrangling - Exercise 3

### 1. Go to https://www.kaggle.com/rounakbanik/the-movies-dataset?select=movies_metadata.csv and download the `movies_metadata.csv` file to your computer and unzip it. Load it to a DataFrame called `movies`

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import ast

filename = 'C:/Users/ardejong/Downloads/movies_metadata.csv'
movies = pd.read_csv(filename)
movies.head()

FileNotFoundError: [Errno 2] No such file or directory: 'arondejong99/master-data-science/module-4/movies_metadata.csv'

In [2]:
movies_df = movies.copy()
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### 2. Count movies per decade

In [3]:
release_date = movies_df.release_date.dropna()
release_date = pd.to_datetime(release_date, errors = 'coerce')
release_decade = release_date.apply(lambda x: (x.year//10)*10).dropna().astype(int)
release_decade.value_counts().sort_index(ascending = False)

release_date
2020        1
2010    12799
2000    11207
1990     5677
1980     3931
1970     3472
1960     2622
1950     2080
1940     1494
1930     1317
1920      432
1910      176
1900       87
1890       75
1880        4
1870        2
Name: count, dtype: int64

### 3. Show average revenue depending on the weekday (Mon, Tue, ...) on the release date.

In [4]:
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors = 'coerce').dropna()
revenue = movies_df[['release_date', 'revenue']].dropna()
revenue['weekday'] = revenue.release_date.apply(lambda x: x.strftime("%A"))
rev_df = pd.DataFrame(revenue.groupby('weekday')['revenue'].mean()).reset_index()

# Create a dictionary to map weekdays to their names
weekday_nr = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}

rev_df['weekday_nr'] = rev_df['weekday'].map(weekday_nr)

rev_df = rev_df.sort_values('weekday_nr')

rev_df[['weekday', 'revenue']]

Unnamed: 0,weekday,revenue
1,Monday,7029549.0
5,Tuesday,11573450.0
6,Wednesday,19023080.0
4,Thursday,15192170.0
0,Friday,10728210.0
2,Saturday,3900390.0
3,Sunday,3817560.0


### 4. How many movies have "Story" (case-insensitive) in their title, for each decade?

In [5]:
titles = movies_df[['release_date', 'original_title']].dropna()
titles['release_decade'] = titles.release_date.apply(lambda x: (x.year//10)*10)
titles['contains_story'] = titles.original_title.str.contains("Story")
titles.groupby('release_decade')['contains_story'].sum()

release_decade
1870      0
1880      0
1890      0
1900      2
1910      0
1920      1
1930      6
1940     12
1950     30
1960      7
1970      5
1980     22
1990     40
2000     79
2010    117
2020      0
Name: contains_story, dtype: int64

### 5. For each genre, create a column with 1 or 0, depending on whether a movie belongs to it.

In [6]:
movies.genres

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
45461    [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
45462                        [{'id': 18, 'name': 'Drama'}]
45463    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464                                                   []
45465                                                   []
Name: genres, Length: 45466, dtype: object

In [7]:
movies_df['genres'] = movies_df['genres'].apply(ast.literal_eval)
unique_genres = set()
for genre_list in movies_df['genres']:
    for genre in genre_list:
        unique_genres.add(genre['name'])

for genre in unique_genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if any(g['name'] == genre for g in x) else 0)

movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,Adventure,Foreign,Odyssey Media,Pulser Productions,Documentary,TV Movie,Rogue State,Vision View Entertainment,Comedy,Carousel Productions
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0,0,0,0,0,0,0,0,1,0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1,0,0,0,0,0,0,0,0,0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0,0,0,0,0,0,0,0,1,0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0,0,0,0,0,0,0,0,1,0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0,0,0,0,0,0,0,0,1,0


### 6. Drop all movies which don't have a hompage

In [8]:
print(movies_df.homepage.isnull().sum())
movies_df = movies_df.dropna(subset = ['homepage'])
print(movies_df.homepage.isnull().sum())

37684
0


### 7. Calculate net earnings per movie

In [9]:
subset = movies_df[['budget', 'revenue']]
subset = subset[~subset['budget'].str.contains('jpg')].astype(float)
subset['revenue'] - subset['budget']

0        343554033.0
9        294194034.0
24        46200000.0
46       294311859.0
49        17341568.0
            ...     
45391            0.0
45392            0.0
45395     -1200000.0
45398     -1254040.0
45461            0.0
Length: 7779, dtype: float64

### 8. Get three most profitable movies per original language

In [10]:
test = movies_df.groupby(['original_language']).apply(lambda x: x.nlargest(3, 'revenue')).reset_index(drop=True)
test = test.dropna()
test[['original_language', 'title', 'revenue']].head(20)

  test = movies_df.groupby(['original_language']).apply(lambda x: x.nlargest(3, 'revenue')).reset_index(drop=True)


Unnamed: 0,original_language,title,revenue
14,cn,Ip Man 2,36000000.0
21,da,Nymphomaniac: Vol. II,2227167.0
28,en,Avatar,2787965000.0
29,en,Star Wars: The Force Awakens,2068224000.0
60,id,The Raid,4105187.0
61,id,The Raid 2,2627209.0
68,it,"The Good, the Bad and the Ugly",6000000.0
94,nl,New Kids Turbo,8786756.0
96,no,Dead Snow,1984662.0
97,no,Dead Snow 2: Red vs. Dead,37473.0


In [11]:
prof_movies_per_lang = movies_df.groupby(['original_language']).apply(lambda x: x.nlargest(3, 'revenue')).reset_index(drop=True)
prof_movies_per_lang = prof_movies_per_lang[['original_language', 'title', 'revenue']]

prof_movies_per_lang.head(15)

  prof_movies_per_lang = movies_df.groupby(['original_language']).apply(lambda x: x.nlargest(3, 'revenue')).reset_index(drop=True)


Unnamed: 0,original_language,title,revenue
0,104.0,,
1,68.0,,
2,82.0,,
3,ab,Manson's Lost Girls,0.0
4,af,Tsotsi,9879971.0
5,af,Road to Your Heart,0.0
6,ar,Caramel,0.0
7,ar,Son of Babylon,0.0
8,ar,The Square,0.0
9,bg,Zift,0.0


### 9. Calculate the z-score of each movie's rating, relative to it's genre

In [13]:
# Create json dicts from genres 
test = movies_df.copy()
test['genres'] = test['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Step 2: Explode the genres to create a row for each genre
movies_exploded = test.explode('genres')
movies_exploded = movies_exploded[movies_exploded['genres'].notna()]

# Step 3: Extract genre name for easier access
movies_exploded['genre_name'] = movies_exploded['genres'].apply(lambda x: x['name'] if isinstance(x, dict) else None)

# Step 4: Calculate mean and standard deviation of ratings for each genre
genre_stats = movies_exploded.groupby('genre_name')['vote_average'].agg(['mean', 'std']).reset_index()

# Step 5: Merge the stats back to the exploded DataFrame
movies_exploded = movies_exploded.merge(genre_stats, on='genre_name', suffixes=('', '_stats'))

# Step 6: Calculate the z-score for each rating
movies_exploded['z_score'] = (movies_exploded['vote_average'] - movies_exploded['mean']) / movies_exploded['std']

# Step 7: Clean up DataFrame to get back to original format if needed
z_scores = movies_exploded.groupby(['title', 'genre_name'])['z_score'].mean().reset_index()

z_scores.head(10)

Unnamed: 0,title,genre_name,z_score
0,!Women Art Revolution,Documentary,-0.777608
1,#Horror,Drama,-1.929503
2,#Horror,Horror,-1.473767
3,#Horror,Mystery,-1.773763
4,#Horror,Thriller,-1.854977
5,#chicagoGirl,Documentary,0.391296
6,$9.99,Animation,-0.382176
7,$9.99,Drama,-0.092825
8,'Twas the Night Before Christmas,Animation,-0.461151
9,'Twas the Night Before Christmas,Family,-0.043627
