In [None]:
import io
import os
import pathlib
import requests
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed NumPy's legacy global random state so any later use of np.random
# is repeatable from a fresh kernel.
np.random.seed(123)

# Notebook-wide display defaults: figure size, float precision for arrays,
# and caps on how much of a DataFrame is rendered.
plt.rcParams["figure.figsize"] = (10, 6)
np.set_printoptions(precision=4)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)

In [None]:
# Directory the downloaded archive is unpacked into (parent of the
# notebook's working directory).
ROOT_DIR = pathlib.Path("..")

DATASETS_URL = "https://github.com/KAUST-Academy/python-for-data-analysis/raw/november-2022/datasets.zip"

# Bound the request so a stalled connection cannot hang the notebook, and
# fail loudly on an HTTP error instead of handing an error page to ZipFile.
response = requests.get(DATASETS_URL, timeout=60)
response.raise_for_status()

# The whole archive is held in memory; the context manager closes the
# ZipFile handle as soon as extraction finishes.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(ROOT_DIR)


In [None]:
# Locations of the extracted data: the top-level datasets folder and the
# MovieLens sub-folder that the .dat files are read from.
DATASETS_DIR = ROOT_DIR.joinpath("datasets")
DATASET_DIR = DATASETS_DIR.joinpath("movielens")

In [None]:
# Column names for the three header-less, "::"-delimited MovieLens files.
unames = ["user_id", "gender", "age", "occupation", "zip"]
rnames = ["user_id", "movie_id", "rating", "timestamp"]
mnames = ["movie_id", "title", "genres"]

# Read users, ratings and movies (in that order) with identical parser
# settings; the multi-character separator requires the Python engine.
users, ratings, movies = (
    pd.read_table(
        DATASET_DIR / file_name,
        sep="::",
        header=None,
        names=column_names,
        engine="python",
    )
    for file_name, column_names in (
        ("users.dat", unames),
        ("ratings.dat", rnames),
        ("movies.dat", mnames),
    )
)

In [None]:
# Preview each raw table. A notebook cell renders only its final expression
# automatically, so the earlier previews are sent through display()
# (the built-in provided by the IPython/Jupyter kernel) to make them visible.
display(users.head(5))
display(ratings.head(5))
display(movies.head(5))
ratings

In [None]:
# Combine the three tables into one row per rating. The join keys are spelled
# out rather than inferred from overlapping column names, and
# validate="many_to_one" makes pandas raise if user_id / movie_id is not
# unique on the right-hand side (which would silently duplicate ratings).
data = (
    ratings
    .merge(users, on="user_id", validate="many_to_one")
    .merge(movies, on="movie_id", validate="many_to_one")
)

# Only the last expression of a cell is rendered automatically, so show the
# merged frame explicitly before inspecting its first row.
display(data)
data.iloc[0]

In [None]:
# Average rating of every title, with one column per gender value.
mean_ratings = data.pivot_table(
    values="rating",
    index="title",
    columns="gender",
    aggfunc="mean",
)
mean_ratings.head()

In [None]:
# Minimum number of ratings a title must have to be kept in the analysis.
MIN_RATINGS = 250

# Number of ratings recorded for each title.
ratings_by_title = data.groupby("title").size()
# Shown via display() because only a cell's last expression renders by itself.
display(ratings_by_title.head())

# Index of the titles that reach the rating-count threshold.
active_titles = ratings_by_title.index[ratings_by_title >= MIN_RATINGS]
active_titles

In [None]:
# Keep only the rows for titles listed in active_titles.
# NOTE: this rebinds mean_ratings to a subset of itself, so the cell is meant
# to run once, directly after the pivot table has been built.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

In [None]:
# Shorten one index label: drop the "(The Magnificent Seven)" part of the
# Seven Samurai title.
mean_ratings = mean_ratings.rename(
    {
        "Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)":
            "Seven Samurai (Shichinin no samurai) (1954)",
    },
    axis="index",
)

In [None]:
# Titles ordered from highest to lowest mean rating in the "F" column.
top_female_ratings = mean_ratings.sort_values(by="F", ascending=False)
top_female_ratings.head(5)

In [None]:
# Per-title gap between the "M" and "F" mean ratings (positive when the
# "M" mean is higher); stored as a new column on mean_ratings.
mean_ratings["diff"] = mean_ratings["M"].sub(mean_ratings["F"])

In [None]:
# Ascending order of "diff": the first rows are the titles whose "F" mean
# exceeds the "M" mean by the most.
sorted_by_diff = mean_ratings.sort_values(by="diff")
sorted_by_diff.head(5)

In [None]:
# Reverse the row order so the largest "diff" values come first, then show
# the first five.
sorted_by_diff.iloc[::-1].head(5)

In [None]:
# Standard deviation of the ratings of each title, restricted to the titles
# listed in active_titles.
rating_std_by_title = (
    data.groupby("title")["rating"]
    .std()
    .loc[active_titles]
)
rating_std_by_title.head(5)

In [None]:
# The ten titles with the largest rating standard deviation.
rating_std_by_title.sort_values(ascending=False).head(10)

In [None]:
# Replace the pipe-delimited "genres" string column with a "genre" column
# holding a list of genre names per movie.
#
# The conversion removes "genres" from `movies`, so it is guarded: re-running
# this cell after the column has already been converted is a no-op instead of
# a KeyError. The previews are passed to display() because only a cell's last
# expression is rendered automatically.
if "genres" in movies.columns:
    display(movies["genres"].head())
    display(movies["genres"].head().str.split("|"))
    movies["genre"] = movies.pop("genres").str.split("|")
movies.head()

In [None]:
# Expand the list-valued "genre" column so each (movie, genre) pair gets its
# own row, then show the first ten rows.
movies_exploded = movies.explode("genre")
movies_exploded.head(10)

In [None]:
# Attach every rating (and the rating user's attributes) to each
# (movie, genre) row. Join keys are explicit rather than inferred from
# whichever column names happen to overlap.
ratings_with_genre = (
    movies_exploded
    .merge(ratings, on="movie_id")
    .merge(users, on="user_id")
)
# Shown via display() because only a cell's last expression renders by itself.
display(ratings_with_genre.iloc[0])

# Mean rating for each genre/age combination, with genres as rows and the
# distinct "age" values as columns.
genre_ratings = (
    ratings_with_genre
    .groupby(["genre", "age"])["rating"]
    .mean()
    .unstack("age")
)
genre_ratings[:10]