###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

In [1]:
import os
import pathlib

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Remove Altair's cap on the number of rows a chart's dataset may contain
alt.data_transformers.disable_max_rows()
# Activate the 'json' data transformer for all subsequent charts
# (presumably stores chart data outside the chart spec — confirm against the Altair docs)
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Root folder holding the raw datasets. It can be overridden through the
# DATASETS_DIR environment variable so the notebook also runs on machines where
# the original author's drive is not mounted; the previous location is the default.
DATASETS = pathlib.Path(os.environ.get('DATASETS_DIR', '/media/alejand/DatasetsT7/datasets'))

In [3]:
# Reads the encoded movies table (indexed by movieId) and gives the
# '(no genres listed)' column the shorter name 'Unknown'
movies = (
    pd.read_csv('data/ml-20m/encoded_movies.csv', index_col='movieId')
    .rename(columns={'(no genres listed)': 'Unknown'})
)
movies.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,Unknown,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,Waiting to Exhale,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
5,Father of the Bride Part II,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [4]:
# Column-wise totals for every column between the first ('title') and the last ('year'),
# i.e. the number of movies flagged with each genre, sorted alphabetically by genre
genre_counts = (
    movies.iloc[:, 1:-1]
    .sum()
    .sort_index()
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'Genre', 0: 'Count'})
)
genre_counts

Unnamed: 0,Genre,Count
0,Action,3520
1,Adventure,2329
2,Animation,1027
3,Children,1139
4,Comedy,8374
5,Crime,2939
6,Documentary,2471
7,Drama,13344
8,Fantasy,1412
9,Film-Noir,330


In [5]:
# Horizontal bar chart: one bar per genre, bar length is the movie count,
# and each genre gets its own color
alt.Chart(genre_counts).mark_bar().encode(
    alt.X('Count:Q'),
    alt.Y('Genre:N'),
    alt.Color('Genre'),
).interactive()

In [6]:
# Loads user ratings
ratings = pd.read_csv(DATASETS / 'recommender/movies/ml-20m/ratings.csv').drop(columns=['timestamp'])
ratings = ratings.set_index(['userId', 'movieId'])
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,2,3.5
1,29,3.5
1,32,3.5
1,47,3.5
1,50,3.5


In [7]:
# Groups the ratings by the 'userId' level of the index so that
# per-user aggregates can be computed from a single groupby object
grouped = ratings.groupby(level='userId')

In [8]:
# Histogram of how many ratings each user submitted; the number of users
# falling in each bin is drawn on a log scale
alt.Chart(
    grouped.count().reset_index()
).mark_bar().encode(
    alt.X('rating', bin=alt.Bin(maxbins=10), title='Number of ratings per user'),
    alt.Y('userId', aggregate='count', scale=alt.Scale(type='log')),
).interactive()

In [9]:
# Histogram of each user's mean rating; bar height is the number of users per bin
alt.Chart(
    grouped.mean().reset_index()
).mark_bar().encode(
    alt.X('rating', bin=alt.Bin(maxbins=10), title='Mean user rating'),
    alt.Y('userId', aggregate='count'),
).interactive()

In [19]:
# Mean rating of every movie, computed over all users that rated it.
# Per the original note, this is meant to support finding the highest rated movie
# per user (if there is a tie, the one with the lowest avg rating is put first).
# NOTE(review): that selection step is not performed in this cell — confirm where it lives.
movie_avg_rating = ratings.groupby(level='movieId').mean()


Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.921240
2,3.211977
3,3.151040
4,2.861393
5,3.064592
...,...
131254,4.000000
131256,4.000000
131258,2.500000
131260,3.000000
