In [None]:
# Movie Ratings Analysis

This notebook analyzes the MovieLens dataset to explore movie ratings, genres, tags, and external links.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
# Apply seaborn's white-grid theme to every figure drawn below.
sns.set(style='whitegrid')

# Load data
# Paths are relative to the notebook's working directory.
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
links = pd.read_csv('../data/links.csv')
tags = pd.read_csv('../data/tags.csv')

# Preview data
# One print per table: a label line followed by the first five rows.
print('Movies:', movies.head(), sep='\n')
print('\nRatings:', ratings.head(), sep='\n')
print('\nLinks:', links.head(), sep='\n')
print('\nTags:', tags.head(), sep='\n')

In [None]:
# Clean data
# Release year: the first parenthesised four-digit number in the title.
# Titles without such a group yield NaN (errors='coerce'), so the column
# is floating point rather than integer.
movies['year'] = pd.to_numeric(
    movies['title'].str.extract(r'\((\d{4})\)', expand=False),
    errors='coerce',
)

# Split the pipe-delimited genre string into a list of genres.
# Only str values are split, which makes this cell safe to re-run: on a
# second execution the column already holds lists, and `.str.split` would
# turn every one of them into NaN. Non-string values (lists, NaN) pass
# through unchanged.
movies['genres'] = movies['genres'].apply(
    lambda g: g.split('|') if isinstance(g, str) else g
)

# Merge datasets
# Left joins keep every rating row; a rating whose movieId is missing from
# `movies` or `links` gets NaN in the columns contributed by that table.
df = (
    ratings
    .merge(movies, on='movieId', how='left')
    .merge(links, on='movieId', how='left')
)
print(df.head())

In [None]:
# Rating distribution
# Histogram of every individual rating in the merged frame, with a KDE
# curve overlaid.
# NOTE(review): if ratings sit on a discrete scale the KDE is only a
# smoothed visual aid, not a density estimate to read values from — confirm.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['rating'], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Movie Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.savefig('../figures/rating_distribution.png')
plt.show()

In [None]:
# Top-rated movies (only movies with at least MIN_RATINGS ratings are ranked)
MIN_RATINGS = 50  # minimum number of ratings a movie needs to be ranked
TOP_N = 10        # number of movies shown in the chart

# Aggregate per movieId (the merge key) together with the title, not per
# title alone: nothing guarantees that title strings are unique, and two
# different movies sharing a title would otherwise be pooled into one row.
# The movieId level is dropped afterwards so `movie_stats` and `top_movies`
# remain indexed by title with columns avg_rating / num_ratings.
movie_stats = (
    df.groupby(['movieId', 'title'])['rating']
    .agg(avg_rating='mean', num_ratings='count')
    .reset_index(level='movieId', drop=True)
)
top_movies = (
    movie_stats[movie_stats['num_ratings'] >= MIN_RATINGS]
    .sort_values('avg_rating', ascending=False)
    .head(TOP_N)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_movies['avg_rating'], y=top_movies.index, ax=ax)
# The chart title is built from the constants so it cannot drift from the
# filter actually applied above.
ax.set_title(f'Top {TOP_N} Movies by Average Rating (Min {MIN_RATINGS} Ratings)')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Movie Title')
# bbox_inches='tight' expands the saved canvas to include the tick labels;
# with the default margins long movie titles are cut off at the left edge
# of the PNG.
fig.savefig('../figures/top_movies.png', bbox_inches='tight')
plt.show()

In [None]:
# Genre analysis
# explode() emits one row per (rating, genre) pair, so a rating of a movie
# with several genres contributes to the mean of each of them.
df_exploded = df.explode('genres')
genre_stats = (
    df_exploded
    .groupby('genres')['rating']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genre_stats.values, y=genre_stats.index, ax=ax)
ax.set_title('Average Rating by Genre')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
fig.savefig('../figures/genre_ratings.png')
plt.show()

In [None]:
# Tag frequency analysis
# value_counts() orders tags from most to least frequent; keep the first ten.
# Tags are compared as exact strings (no case or whitespace normalisation).
tag_counts = tags['tag'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=tag_counts.values, y=tag_counts.index, ax=ax)
ax.set_title('Top 10 Most Common Tags')
ax.set_xlabel('Count')
ax.set_ylabel('Tag')
fig.savefig('../figures/top_tags.png')
plt.show()

In [None]:
# Tags for top-rated movies
# `top_movies` is indexed by title, so map the titles back to movieIds
# before filtering the tag table.
top_movie_titles = top_movies.index
top_movie_ids = movies.loc[movies['title'].isin(top_movie_titles), 'movieId']
top_movie_tags = tags.loc[tags['movieId'].isin(top_movie_ids), ['movieId', 'tag']]
top_movie_tags = pd.merge(top_movie_tags, movies[['movieId', 'title']], on='movieId')

# Number of times each tag was applied to each of the selected movies.
tag_summary = top_movie_tags.groupby(['title', 'tag']).size().reset_index(name='count')

# Three most frequent tags per movie.
# Implemented as sort + groupby.head instead of groupby.apply(nlargest):
# apply() handing the grouping column to the function is deprecated in
# recent pandas, and once it is excluded the subsequent
# reset_index(drop=True) would throw the movie title away. The explicit
# sort keys (title, count descending, tag) keep the original row order and
# make tie-breaking between equally frequent tags deterministic.
top_tags_per_movie = (
    tag_summary
    .sort_values(['title', 'count', 'tag'], ascending=[True, False, True])
    .groupby('title')
    .head(3)
    .reset_index(drop=True)
)

print('Top Tags for Top-Rated Movies:')
print(top_tags_per_movie)