# MovieLens EDA
Source: https://files.grouplens.org/datasets/movielens/ml-20m-README.html

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
# Make "plotly_dark" the default Plotly template for figures created after this point.
pio.templates.default = "plotly_dark"

## Movies
This dataset is a subset of the [MovieLens 20M Dataset](https://grouplens.org/datasets/movielens/20m/), which contains 20 million ratings and 465,000 tag applications applied to 27,000 movies by 138,000 users. The data was collected from the MovieLens website, a movie recommendation service.

**movies.parquet**
- Contains movie information such as movieId, title, and genres.
- Each movie is identified by a unique `movieId`.
- The `genres` field is a pipe-separated list of genres associated with the movie.

**Fields:**
- `movieId`: Unique identifier for a movie.
- `title`: Movie title with release year in parentheses.
- `genres`: Pipe-separated list of genres (e.g., Adventure|Animation|Children|Comedy|Fantasy).

In [None]:
# Load the movie catalogue.
movies_df = pd.read_parquet("./data/parquet/movies.parquet")

# Turn the pipe-separated genre string into a list of genres; any non-string
# value becomes an empty list.
# NOTE(review): the MovieLens README mentions a "(no genres listed)" marker —
# if present it is kept here as a single pseudo-genre; confirm that is intended.
movies_df["genres"] = movies_df["genres"].apply(
    lambda x: x.split("|") if isinstance(x, str) else []
)

# Extract the release year from the title and add it as a new column 'year'.
# The greedy ".*" prefix makes the pattern capture the LAST "(YYYY)" group, so
# a four-digit parenthetical that is part of the title itself is not mistaken
# for the release year. expand=False returns a Series (not a one-column
# DataFrame); float dtype keeps NaN for titles without a year.
movies_df["year"] = (
    movies_df["title"].str.extract(r".*\((\d{4})\)", expand=False).astype(float)
)

# Number of genres attached to each movie.
movies_df["genre_count"] = movies_df["genres"].apply(len)
movies_df.info()
movies_df.head()

### Movies per release year

In [None]:
# Plot the number of movies released per year using Plotly.
# Count the movies for each release year, ordered chronologically. The column
# names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x, and the plot
# below selects the columns by name.
plot_df = (
    movies_df["year"]
    .value_counts()
    .sort_index()
    .rename_axis("year")
    .reset_index(name="count")
)

# Interactive bar chart: one bar per release year.
fig = px.bar(
    plot_df,
    x="year",
    y="count",
    title="Number of Movies Released Per Year",
)
# Human-readable axis titles.
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Number of Movies",
)

# Display the plot
fig.show()

### Movie Genres

In [None]:
# Count how many movies have 1, 2, 3, ... genres. Column names are set
# explicitly because the names produced by value_counts().reset_index() differ
# between pandas 1.x and 2.x.
plot_df = (
    movies_df["genre_count"]
    .value_counts()
    .sort_index()
    .rename_axis("genre_count")
    .reset_index(name="count")
)

# Create an interactive bar chart with Plotly
fig = px.bar(
    plot_df,
    x="genre_count",
    y="count",
    title="Number of Movies per Number of Genres",
    log_y=True,  # Apply log scale to y-axis
)
# The x-axis shows the number of genres per movie (it was mislabelled "Year").
fig.update_layout(
    xaxis_title="Number of Genres",
    yaxis_title="Number of Movies",
)
fig.show()

In [None]:
# One row per (movie, genre) pair, then count the movies carrying each genre.
# Column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
plot_df = (
    movies_df["genres"]
    .explode()
    .value_counts()
    .rename_axis("genres")
    .reset_index(name="count")
)
# Share of all movies that carry the genre. A movie can have several genres,
# so the percentages are not expected to add up to 100.
plot_df["percentage"] = plot_df["count"] / len(movies_df) * 100

fig = px.bar(
    plot_df,
    x="genres",
    y="percentage",
    title="Percentage of Movies per Genre",
)
fig.update_layout(
    xaxis_title="Genre",
    yaxis_title="Percentage of Movies",
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
)
fig.show()

## Ratings
The ratings data is sourced from the [MovieLens 20M Dataset](https://grouplens.org/datasets/movielens/20m/). It contains user ratings for movies, where each user has rated at least 20 movies.

**ratings.parquet**
- Contains the ratings given by users to movies.
- Each row represents a single rating event.

**Fields:**
- `userId`: Unique identifier for a user.
- `movieId`: Unique identifier for a movie.
- `rating`: The rating given by the user (0.5 to 5.0 in 0.5 increments).
- `timestamp`: Unix timestamp of when the rating was made.

In [None]:
# Read the ratings table from parquet, print its schema summary and preview
# the first rows.
ratings_df = pd.read_parquet(path="./data/parquet/ratings.parquet")
ratings_df.info()
ratings_df.head(n=5)

In [None]:
# Movies from the catalogue whose movieId never occurs in the ratings table.
movies_with_ratings = ratings_df["movieId"].unique()
is_rated = movies_df["movieId"].isin(movies_with_ratings)
movies_without_ratings = movies_df.loc[~is_rated]

# Summary counts: catalogue size, distinct rated movieIds, unrated movies.
print(f"Total movies: {len(movies_df)}")
print(f"Movies with ratings: {len(movies_with_ratings)}")
print(f"Movies without ratings: {len(movies_without_ratings)}")

# Show the unrated movies as the cell output.
movies_without_ratings

In [None]:
# Attach each rating to its movie's metadata. Both frames are indexed by
# movieId and inner-joined, so only movieIds present in both tables survive;
# clashing column names would get the "_movie" / "_rating" suffixes.
join_options = {"how": "inner", "lsuffix": "_movie", "rsuffix": "_rating"}
rated_movies_df = (
    movies_df.set_index("movieId")
    .join(ratings_df.set_index("movieId"), **join_options)
    .reset_index()
)
# "deep" measures the real memory used by object columns as well.
rated_movies_df.info(memory_usage="deep")

In [None]:
# Report the number of distinct users in the joined ratings (thousands-separated).
unique_user_count = rated_movies_df["userId"].nunique()
print(f"Number of unique users: {unique_user_count:,}")

In [None]:
# Count the missing values in every column of the joined frame.
rated_movies_df.isna().sum()

### Ratings per user

In [None]:
# Build a (userId, num_ratings) table: how many ratings each user submitted.
ratings_per_user = (
    ratings_df["userId"]
    .value_counts()
    .rename_axis("userId")
    .reset_index(name="num_ratings")
)

# Histogram of the per-user rating counts; the y-axis is logarithmic.
fig = px.histogram(
    ratings_per_user,
    x="num_ratings",
    log_y=True,
    labels={"num_ratings": "Number of Ratings"},
    title="Distribution of Number of Ratings per User",
)
axis_titles = {
    "xaxis_title": "Number of Ratings per User",
    "yaxis_title": "Number of Users",
}
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Recompute the (userId, num_ratings) table so this cell runs on its own.
user_counts = ratings_df["userId"].value_counts()
ratings_per_user = user_counts.rename_axis("userId").reset_index(name="num_ratings")

# Box plot of the per-user rating counts on a logarithmic x-axis.
fig = px.box(
    ratings_per_user,
    x="num_ratings",
    log_x=True,
    labels={"num_ratings": "Number of Ratings"},
    title="Distribution of Number of Ratings per User",
)
fig.update_layout(xaxis_title="Number of Ratings per User")
fig.show()

### Ratings per Movie

In [None]:
# Build a (movieId, num_ratings) table: how many ratings each movie received.
ratings_per_movie = (
    ratings_df["movieId"]
    .value_counts()
    .rename_axis("movieId")
    .reset_index(name="num_ratings")
)

# Histogram of the per-movie rating counts; the y-axis is logarithmic.
fig = px.histogram(
    ratings_per_movie,
    x="num_ratings",
    log_y=True,
    labels={"num_ratings": "Number of Ratings"},
    title="Distribution of Number of Ratings per Movie",
)
axis_titles = {
    "xaxis_title": "Number of Ratings per Movie",
    "yaxis_title": "Number of Movies",
}
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Recompute the (movieId, num_ratings) table so this cell runs on its own.
movie_counts = ratings_df["movieId"].value_counts()
ratings_per_movie = movie_counts.rename_axis("movieId").reset_index(name="num_ratings")

# Box plot of the per-movie rating counts on a logarithmic x-axis.
fig = px.box(
    ratings_per_movie,
    x="num_ratings",
    log_x=True,
    labels={"num_ratings": "Number of Ratings"},
    title="Distribution of Number of Ratings per Movie",
)
fig.update_layout(xaxis_title="Number of Ratings per Movie")
fig.show()

### Ratings score distribution

In [None]:
# Visualize the distribution of ratings.
# Percentage of all ratings that fall on each rating value, ordered by value.
rating_counts = ratings_df["rating"].value_counts(normalize=True).sort_index() * 100

# Give both columns explicit, distinct names. The previous version passed
# 'rating' for both x and y, so the percentages were never plotted, and the
# column names produced by reset_index() also differ between pandas 1.x and 2.x.
plot_df = rating_counts.rename_axis("rating").reset_index(name="percentage")

fig = px.bar(
    plot_df,
    x="rating",
    y="percentage",
    title="Distribution of Ratings",
    labels={"rating": "Rating", "percentage": "Percentage of Ratings"},
)
fig.update_layout(
    xaxis_title="Rating",
    yaxis_title="Percentage of Ratings",
)

fig.show()

### Ratings per genre

In [None]:
# Sample 10% of the ratings BEFORE exploding the genre lists. Exploding the
# whole joined frame first builds one row per (rating, genre) pair for every
# rating only to throw 90% of them away. Only the two columns the plot needs
# are kept, and a fixed seed makes the figure reproducible between runs.
plot_df = (
    rated_movies_df[["genres", "rating"]]
    .sample(frac=0.1, random_state=42)
    .explode("genres")  # each rating now counts once for every genre of its movie
)

# Mean rating per genre, computed on the same sample the box plot shows.
mean_ratings = (
    plot_df.groupby("genres")["rating"]
    .mean()
    .reset_index()
    .rename(columns={"rating": "mean_rating"})
)

# Box plot of the rating distribution per genre.
fig = px.box(
    plot_df,
    x="genres",
    y="rating",
    title="Distribution of Ratings per Genre",
    labels={"genres": "Genre", "rating": "Rating"},
)
# Overlay the per-genre mean as red markers.
fig.add_scatter(
    x=mean_ratings["genres"],
    y=mean_ratings["mean_rating"],
    mode="markers",
    name="Mean Rating",
    marker=dict(color="red", size=8),
)

fig.update_layout(
    xaxis_tickangle=-45,
)
fig.show()

### Highest rated movies

In [None]:
# NOTE(review): the original cell carried a "not working" TODO without details.
# It grouped the joined frame by the title string, which merges distinct movies
# that share a title and stored the rating count in a column named "movieId".
# Aggregating by movieId and attaching titles afterwards avoids both — confirm
# this resolves the original problem.
min_ratings = 1000  # ignore movies with fewer ratings than this
top_n = 10  # number of movies to show

# Average rating and number of ratings per movie, with the title attached.
# The inner merge keeps only movies present in the catalogue.
movie_stats = (
    ratings_df.groupby("movieId")
    .agg(avg_rating=("rating", "mean"), num_ratings=("rating", "count"))
    .reset_index()
    .merge(movies_df[["movieId", "title"]], on="movieId", how="inner")
)

# Keep sufficiently-rated movies and take the top_n by average rating.
plot_df = (
    movie_stats[movie_stats["num_ratings"] >= min_ratings]
    .sort_values("avg_rating", ascending=False)
    .head(top_n)
)

# Append the rating count to the title so it shows on the axis label.
plot_df = plot_df.assign(
    title=plot_df["title"] + " (" + plot_df["num_ratings"].astype(str) + " ratings)"
)

# Plot using Plotly; rows are sorted ascending before plotting.
fig = px.bar(
    plot_df.sort_values("avg_rating"),
    x="avg_rating",
    y="title",
    orientation="h",
    title=f"Top {top_n} Highest Rated Movies (at least {min_ratings} ratings)",
    labels={"avg_rating": "Average Rating", "title": "Movie Title"},
)
fig.show()

### Lowest rated movies


In [None]:
# NOTE(review): the original cell carried a "not working" TODO without details.
# It grouped the joined frame by the title string, which merges distinct movies
# that share a title and stored the rating count in a column named "movieId".
# Aggregating by movieId and attaching titles afterwards avoids both — confirm
# this resolves the original problem.
min_ratings = 1000  # ignore movies with fewer ratings than this
bottom_n = 10  # number of movies to show

# Average rating and number of ratings per movie, with the title attached.
# The inner merge keeps only movies present in the catalogue.
movie_stats = (
    ratings_df.groupby("movieId")
    .agg(avg_rating=("rating", "mean"), num_ratings=("rating", "count"))
    .reset_index()
    .merge(movies_df[["movieId", "title"]], on="movieId", how="inner")
)

# Keep sufficiently-rated movies and take the bottom_n by average rating.
plot_df = (
    movie_stats[movie_stats["num_ratings"] >= min_ratings]
    .sort_values("avg_rating", ascending=True)
    .head(bottom_n)
)

# Append the rating count to the title so it shows on the axis label.
plot_df = plot_df.assign(
    title=plot_df["title"] + " (" + plot_df["num_ratings"].astype(str) + " ratings)"
)

# Plot using Plotly; rows are sorted ascending before plotting.
fig = px.bar(
    plot_df.sort_values("avg_rating"),
    x="avg_rating",
    y="title",
    orientation="h",
    title=f"Top {bottom_n} Lowest Rated Movies (at least {min_ratings} ratings)",
    labels={"avg_rating": "Average Rating", "title": "Movie Title"},
)
fig.show()

### Number of ratings vs average rating

In [None]:
# Calculate average rating and number of ratings per movie
ratings_stats = ratings_df.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    num_ratings=('rating', 'count')
).reset_index()

# Merge with movies_df to get movie titles (optional)
ratings_stats = ratings_stats.merge(movies_df[['movieId', 'title']], on='movieId', how='left')

# Scatter plot: Number of ratings vs Average rating
fig = px.scatter(
    ratings_stats,
    x='num_ratings',
    y='avg_rating',
    hover_data=['title'],
    title='Number of Ratings vs Average Rating per Movie',
    labels={'num_ratings': 'Number of Ratings', 'avg_rating': 'Average Rating'},
    opacity=0.5,
    log_x=True
)
fig.update_layout(
    xaxis_title='Number of Ratings (log scale)',
    yaxis_title='Average Rating'
)
fig.show()

### Number of ratings per user vs average rating

In [None]:
# Calculate average rating and number of ratings per user
user_stats = ratings_df.groupby('userId').agg(
    avg_rating=('rating', 'mean'),
    num_ratings=('rating', 'count')
).reset_index()

# Scatter plot: Number of ratings per user vs Average rating
fig = px.scatter(
    user_stats,
    x='num_ratings',
    y='avg_rating',
    hover_data=['userId'],
    title='Number of Ratings per User vs Average Rating',
    labels={'num_ratings': 'Number of Ratings per User', 'avg_rating': 'Average Rating'},
    opacity=0.5,
    log_x=True
)
fig.update_layout(
    xaxis_title='Number of Ratings per User (log scale)',
    yaxis_title='Average Rating'
)
fig.show()

## Tags
Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.

**tags.parquet**
- Contains the tags applied to movies by users.
- Each row represents a single tag application.

**Fields:**
- `userId`: Unique identifier for a user.
- `movieId`: Unique identifier for a movie.
- `tag`: The tag text applied to the movie.
- `timestamp`: Unix timestamp of when the tag was applied.

In [None]:
# Read the user tag applications from parquet, print the schema summary and
# preview the first rows.
tags_df = pd.read_parquet(path="./data/parquet/tags.parquet")
tags_df.info()
tags_df.head(n=5)

## Links
The links file provides identifiers that allow you to link MovieLens movies with other movie-related databases.

**links.parquet**
- Contains identifiers for external movie databases.

**Fields:**
- `movieId`: Unique identifier for a movie.
- `imdbId`: Identifier for the movie in the [IMDb](https://www.imdb.com/) database.
- `tmdbId`: Identifier for the movie in the [TMDb](https://www.themoviedb.org/) database.

In [None]:
# Read the external-database links from parquet, print the schema summary and
# preview the first rows.
links_df = pd.read_parquet(path="./data/parquet/links.parquet")
links_df.info()
links_df.head(n=5)

In [None]:
# Movie metadata indexed by movieId, every column prefixed with "movie_".
movie_columns = movies_df.set_index("movieId").add_prefix("movie_")

# links_df is reassigned in place, so on a second run of this cell it already
# contains the "movie_*" columns and a plain join raises
# "ValueError: columns overlap but no suffix specified". Dropping any such
# leftovers first makes the cell safe to re-run.
links_df = links_df.drop(columns=movie_columns.columns, errors="ignore").join(
    movie_columns, on="movieId", how="left"
)
links_df

In [None]:
# Count the missing values in each column of links_df.
missing_per_column = links_df.isna().sum()
missing_per_column

## Genome Tags
Genome tags are a set of tags that have been applied to movies in a consistent, controlled way. They are used in the MovieLens tag genome project to describe movies with a rich set of attributes.

**genome_tags.parquet**
- Contains the set of tags used in the tag genome.

**Fields:**
- `tagId`: Unique identifier for a tag.
- `tag`: The tag text.

In [None]:
# Read the tag-genome vocabulary from parquet, print the schema summary and
# preview the first rows.
genome_tags_df = pd.read_parquet(path="./data/parquet/genome_tags.parquet")
genome_tags_df.info()
genome_tags_df.head(n=5)


## Genome Scores
Genome scores quantify the relevance of each genome tag to each movie, as determined by user tagging activity.

**genome_scores.parquet**
- Contains the relevance scores for each tag for each movie.

**Fields:**
- `movieId`: Unique identifier for a movie.
- `tagId`: Unique identifier for a tag.
- `relevance`: Relevance score (ranging from 0 to 1) indicating how well the tag describes the movie.

In [None]:
# Read the tag-genome relevance scores from parquet, print the schema summary
# and preview the first rows.
genome_scores_df = pd.read_parquet(path="./data/parquet/genome_scores.parquet")
genome_scores_df.info()
genome_scores_df.head(n=5)