# Find Average Rating for each Movie as rated by our Users Sub-section

In [2]:
import pandas as pd

For each movie in the merged DataFrame, we want to get the average rating from our user_ratings table and append it as a column to the merged DataFrame.

In [3]:
# Read the two input CSVs from the test-data folder: the user ratings table
# and the merged movie table that the rest of the notebook enriches.
USER_RATINGS_CSV = "../tests/test_data/expected_user_ratings_clean_results.csv"
MERGED_CLEAN_CSV = "../tests/test_data/expected_merged_clean_results.csv"

user_ratings = pd.read_csv(USER_RATINGS_CSV)
merged_clean = pd.read_csv(MERGED_CLEAN_CSV)

In [4]:
# Mean of rating_val for every movie_id, returned as a two-column frame
# (movie_id, power_users_rating) so it can later be joined on movie_id.
avg_ratings = (
    user_ratings.groupby("movie_id")["rating_val"]
    .mean()
    .rename("power_users_rating")  # name the Series before it becomes a column
    .reset_index()
)

avg_ratings.head()

Unnamed: 0,movie_id,power_users_rating
0,10-cloverfield-lane,7.385838
1,10-things-i-hate-about-you,7.443325
2,12-angry-men,9.188402
3,12-years-a-slave,8.245345
4,127-hours,7.150172


Let's round the power_users_rating column to 2 decimal places so that its precision matches the rating column.

In [5]:
# Round only the power_users_rating column to two decimals; every other
# column (movie_id) is passed through untouched.
avg_ratings = avg_ratings.round({"power_users_rating": 2})
avg_ratings.head()

Unnamed: 0,movie_id,power_users_rating
0,10-cloverfield-lane,7.39
1,10-things-i-hate-about-you,7.44
2,12-angry-men,9.19
3,12-years-a-slave,8.25
4,127-hours,7.15


In [6]:
# Left join on movie_id: every row of merged_clean is kept and gains a
# power_users_rating column taken from avg_ratings.
merged_clean_enriched = pd.merge(
    merged_clean,
    avg_ratings,
    how="left",
    on="movie_id",
)
merged_clean_enriched.head()

Unnamed: 0,movie_id,movie_title,genres,original_language,image_url,runtime,spoken_languages,year_released,rating,power_users_rating
0,napoleon-dynamite,Napoleon Dynamite,['Comedy'],en,sm/upload/wu/r9/ma/tt/2VMXuUAvU8T0oQl0w77CqVAR...,95,['English'],2004,7.36,6.79
1,insomnia-2002,Insomnia,"['Crime', 'Mystery', 'Thriller']",en,film-poster/5/1/7/3/9/51739-insomnia-0-230-0-3...,118,['English'],2002,7.06,6.76
2,a-bugs-life,A Bug's Life,"['Adventure', 'Animation', 'Comedy', 'Family']",en,film-poster/4/7/1/1/1/47111-a-bug-s-life-0-230...,95,['English'],1998,6.9,6.59
3,before-midnight,Before Midnight,"['Romance', 'Drama']",en,film-poster/1/0/2/4/4/4/102444-before-midnight...,109,"['English', 'ελληνικά', 'Français']",2013,8.14,8.47
4,searching-2018,Searching,"['Thriller', 'Mystery', 'Drama']",en,film-poster/4/2/0/4/6/7/420467-searching-0-230...,102,['English'],2018,7.44,7.41


# Rating Count by our Users Sub-section for each Movie

I also want to add the number of ratings our power users gave each movie, as a measure of that movie's popularity.

In [7]:
# Count the non-null rating_val entries per movie_id. The result is a Series
# indexed by movie_id (still named "rating_val" at this point).
ratings_by_movie = user_ratings.groupby("movie_id")
rating_count = ratings_by_movie["rating_val"].count()

rating_count.head()

movie_id
10-cloverfield-lane           3587
10-things-i-hate-about-you    2779
12-angry-men                  3604
12-years-a-slave              3061
127-hours                     2031
Name: rating_val, dtype: int64

In [8]:
# Attach the per-movie rating count to the enriched table as "rating_count".
#
# This cell reassigns merged_clean_enriched from itself, so it must be safe to
# re-run: if a previous run already added "rating_count", merging again would
# make pandas disambiguate the clash into rating_count_x / rating_count_y and
# the exported fixture would end up with the wrong columns. Dropping any
# existing "rating_count" first (errors="ignore" makes this a no-op on a fresh
# kernel) keeps the result identical no matter how often the cell is executed.
#
# The join is a left join on movie_id, so every row of the enriched table is
# kept; a movie_id missing from rating_count would receive NaN here.
rating_count_named = rating_count.rename("rating_count")
merged_clean_enriched = merged_clean_enriched.drop(
    columns="rating_count", errors="ignore"
).merge(rating_count_named, on="movie_id", how="left")
merged_clean_enriched.head()

Unnamed: 0,movie_id,movie_title,genres,original_language,image_url,runtime,spoken_languages,year_released,rating,power_users_rating,rating_count
0,napoleon-dynamite,Napoleon Dynamite,['Comedy'],en,sm/upload/wu/r9/ma/tt/2VMXuUAvU8T0oQl0w77CqVAR...,95,['English'],2004,7.36,6.79,2101
1,insomnia-2002,Insomnia,"['Crime', 'Mystery', 'Thriller']",en,film-poster/5/1/7/3/9/51739-insomnia-0-230-0-3...,118,['English'],2002,7.06,6.76,2089
2,a-bugs-life,A Bug's Life,"['Adventure', 'Animation', 'Comedy', 'Family']",en,film-poster/4/7/1/1/1/47111-a-bug-s-life-0-230...,95,['English'],1998,6.9,6.59,2608
3,before-midnight,Before Midnight,"['Romance', 'Drama']",en,film-poster/1/0/2/4/4/4/102444-before-midnight...,109,"['English', 'ελληνικά', 'Français']",2013,8.14,8.47,2768
4,searching-2018,Searching,"['Thriller', 'Mystery', 'Drama']",en,film-poster/4/2/0/4/6/7/420467-searching-0-230...,102,['English'],2018,7.44,7.41,2775


I'm happy with this table now, so let's export the result of the merge to a CSV file that we can test against in the pipeline.

In [9]:
# Write the enriched table to the test-data folder, omitting the DataFrame
# index so the CSV contains only the data columns.
ENRICHED_CSV = "../tests/test_data/expected_merged_enriched_results.csv"
merged_clean_enriched.to_csv(ENRICHED_CSV, index=False)