In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
# MovieLens "ml-latest-small" dataset, from https://grouplens.org/datasets/movielens/latest/
# csv1 -> movies.csv, csv2 -> ratings.csv, both located under movie_data/ml-latest-small/.
csv1, csv2 = (
    os.path.join('movie_data', 'ml-latest-small', filename)
    for filename in ('movies.csv', 'ratings.csv')
)

In [5]:
# Load both CSV files with pandas' default parsing options.
movies_df = pd.read_csv(filepath_or_buffer=csv1)    # read from the movies.csv path
ratings_df = pd.read_csv(filepath_or_buffer=csv2)   # read from the ratings.csv path

In [6]:
# Preview the first five rows of the movies table.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Keep only the movies whose `genres` string matches 'Action'.
action_df = movies_df.loc[movies_df['genres'].str.contains('Action')]

In [9]:
# Mean rating each user gave to Action movies (one value per userId).
(
    ratings_df
    .loc[ratings_df['movieId'].isin(action_df['movieId']), ['userId', 'rating']]
    .groupby('userId')['rating']
    .mean()
)

userId
1      4.322222
2      3.954545
3      3.571429
4      3.320000
5      3.111111
         ...   
606    3.178808
607    3.722222
608    3.330325
609    3.090909
610    3.600580
Name: rating, Length: 608, dtype: float64

In [10]:
def get_genre_ratings(genres, movies=None, ratings=None):
    """Build a per-user table of average ratings, one column per genre.

    For each entry of ``genres``, the movies whose ``genres`` string contains
    that text are selected, the ratings given to those movies are averaged per
    ``userId``, and the mean is rounded to two decimals.

    Parameters
    ----------
    genres : iterable of str
        Genre names. Each one is matched as a literal, case-sensitive
        substring of the ``genres`` column (not as a regular expression).
    movies : pandas.DataFrame, optional
        Frame with ``movieId`` and ``genres`` columns. Defaults to the
        notebook-level ``movies_df``.
    ratings : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. Defaults to
        the notebook-level ``ratings_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by user id (index left unnamed), with one
        ``avg_<genre lower-cased>_rating`` column per requested genre, in the
        order given. A cell is NaN when the user rated no movie of that genre.
        An empty frame is returned when ``genres`` is empty.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if movies is None:
        movies = movies_df
    if ratings is None:
        ratings = ratings_df

    column_names = []
    per_genre_means = []
    for genre in genres:
        # regex=False: genre names such as "(no genres listed)" contain regex
        # metacharacters and must be matched literally.
        # na=False: a movie with a missing genre string belongs to no genre
        # (an NA mask would make the boolean indexing below raise).
        in_genre = movies['genres'].str.contains(genre, regex=False, na=False)
        genre_movie_ids = movies.loc[in_genre, 'movieId']
        rated_in_genre = ratings.loc[ratings['movieId'].isin(genre_movie_ids)]
        per_genre_means.append(
            rated_in_genre.groupby('userId')['rating'].mean().round(2)
        )
        column_names.append(f'avg_{genre.lower()}_rating')

    # pd.concat raises on an empty list; no genres means no columns.
    if not per_genre_means:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    genre_ratings = pd.concat(per_genre_means, axis=1)
    genre_ratings.columns = column_names
    # Drop the 'userId' index name so the index stays unnamed.
    genre_ratings.index = genre_ratings.index.rename(None)
    return genre_ratings

In [11]:
# Average Drama and Romance rating per user, then preview the first five rows.
genre_ratings_df = get_genre_ratings(genres=['Drama', 'Romance'])
genre_ratings_df.head(n=5)

Unnamed: 0,avg_drama_rating,avg_romance_rating
1,4.53,4.31
2,3.88,4.5
3,0.75,0.5
4,3.48,3.38
5,3.8,3.09
