In [None]:
# Movie Ratings Analysis

This notebook analyzes the MovieLens dataset to explore movie ratings, genres, tags, and external links.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to every figure produced below
sns.set(style='whitegrid')

# Read the four CSV tables from the data directory
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
links = pd.read_csv('../data/links.csv')
tags = pd.read_csv('../data/tags.csv')

# Show the first rows of each table under its own heading
previews = (
    ('Movies:', movies),
    ('\nRatings:', ratings),
    ('\nLinks:', links),
    ('\nTags:', tags),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

In [None]:
# Clean data
# Pull the four-digit "(YYYY)" group out of the title as a numeric year;
# titles without such a group end up as NaN.
movies['year'] = pd.to_numeric(
    movies['title'].str.extract(r'\((\d{4})\)', expand=False),
    errors='coerce',
)
# Split the pipe-delimited genre string into a list of genres. Only plain
# strings are split, so re-running this cell leaves already-split lists (and
# missing values) untouched instead of clobbering them.
movies['genres'] = movies['genres'].map(
    lambda g: g.split('|') if isinstance(g, str) else g
)

# Merge datasets
# Each rating row should pick up exactly one movie row and one links row.
# validate='many_to_one' makes pandas raise MergeError if movieId is duplicated
# on the right-hand side, rather than silently multiplying rating rows.
df = pd.merge(ratings, movies, on='movieId', how='left', validate='many_to_one')
df = pd.merge(df, links, on='movieId', how='left', validate='many_to_one')
print(df.head())

In [None]:
# Rating distribution
# Histogram of every individual rating, with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['rating'], bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Movie Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.savefig('../figures/rating_distribution.png')
plt.show()

In [None]:
# Top-rated movies (min 50 ratings)
MIN_RATINGS = 50  # movies with fewer ratings are excluded from the ranking
TOP_N = 10        # number of movies shown

# Aggregate per movieId rather than per title alone: movieId is the join key
# for these tables and titles are not guaranteed to be unique, so grouping on
# title would pool the ratings of any movies that share a title string.
# movieId is then moved back to a column so the index is still the title.
movie_stats = (
    df.groupby(['movieId', 'title'])['rating']
    .agg(avg_rating='mean', num_ratings='count')
    .reset_index(level='movieId')
)
top_movies = (
    movie_stats[movie_stats['num_ratings'] >= MIN_RATINGS]
    .sort_values('avg_rating', ascending=False)
    .head(TOP_N)
)

# Horizontal bar chart: one bar per movie, length = average rating
plt.figure(figsize=(10, 6))
sns.barplot(x=top_movies['avg_rating'], y=top_movies.index)
plt.title(f'Top {TOP_N} Movies by Average Rating (Min {MIN_RATINGS} Ratings)')
plt.xlabel('Average Rating')
plt.ylabel('Movie Title')
plt.savefig('../figures/top_movies.png')
plt.show()

In [None]:
# Genre analysis
# Expand the 'genres' column so each element gets its own row, then average
# the ratings within each genre and order genres from highest to lowest mean.
df_exploded = df.explode('genres')
mean_by_genre = df_exploded.groupby('genres')['rating'].mean()
genre_stats = mean_by_genre.sort_values(ascending=False)

# Horizontal bar chart: one bar per genre
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=genre_stats.values, y=genre_stats.index, ax=ax)
ax.set_title('Average Rating by Genre')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
fig.savefig('../figures/genre_ratings.png')
plt.show()

In [None]:
# Tag frequency analysis
# value_counts() orders tags by descending frequency; keep the first ten
tag_frequency = tags['tag'].value_counts()
tag_counts = tag_frequency.iloc[:10]

# Horizontal bar chart: one bar per tag, length = number of occurrences
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=tag_counts.values, y=tag_counts.index, ax=ax)
ax.set_title('Top 10 Most Common Tags')
ax.set_xlabel('Count')
ax.set_ylabel('Tag')
fig.savefig('../figures/top_tags.png')
plt.show()

In [None]:
# Tags for top-rated movies
# Map the top-rated titles back to movieIds, then keep only tags on those movies
top_movie_titles = top_movies.index
top_movie_ids = movies.loc[movies['title'].isin(top_movie_titles), 'movieId']
top_movie_tags = tags.loc[tags['movieId'].isin(top_movie_ids), ['movieId', 'tag']]
top_movie_tags = pd.merge(top_movie_tags, movies[['movieId', 'title']], on='movieId')

# Count how often each tag was applied to each movie
tag_summary = top_movie_tags.groupby(['title', 'tag']).size().reset_index(name='count')

# Keep the three most frequent tags per movie.
# Sorting and then taking groupby().head(3) replaces
# groupby('title').apply(nlargest): apply() operating on the grouping column
# is deprecated in pandas, and once the column is excluded the 'title' values
# would be lost after reset_index(drop=True). Ties on count are broken by tag
# name so the output is deterministic.
top_tags_per_movie = (
    tag_summary
    .sort_values(['title', 'count', 'tag'], ascending=[True, False, True])
    .groupby('title')
    .head(3)
    .reset_index(drop=True)
)

print('Top Tags for Top-Rated Movies:')
print(top_tags_per_movie)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np

# Create tag profiles for each movie
# Rows with a missing tag are dropped first: a NaN tag is a float, and
# ' '.join() raises TypeError on any non-string item.
tag_text = tags.dropna(subset=['tag']).assign(
    tag=lambda t: t['tag'].astype(str).str.lower()
)
# One row per movie: all of its lower-cased tags joined into a single string
tag_profiles = tag_text.groupby('movieId')['tag'].agg(' '.join).reset_index()
tag_profiles = pd.merge(tag_profiles, movies[['movieId', 'title']], on='movieId', how='left')

# Convert tags to TF-IDF features
# Vocabulary is capped at 100 terms, with English stop words removed
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(tag_profiles['tag'])

# Apply K-means clustering
# n_init is given explicitly because its default differs between scikit-learn
# releases; leaving it implicit makes the cluster assignments depend on the
# installed version (and triggers a FutureWarning on some versions).
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
tag_profiles['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Reduce dimensions for visualization
# PCA is fitted on the densified TF-IDF matrix (at most 100 columns, see max_features)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(tfidf_matrix.toarray())
tag_profiles['pca1'] = pca_result[:, 0]
tag_profiles['pca2'] = pca_result[:, 1]

# Plot clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='pca1', y='pca2', hue='cluster', palette='deep', data=tag_profiles, legend='full')
plt.title('Movie Clusters Based on Tags (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig('../figures/tag_clusters.png')
plt.show()

# Display sample movies per cluster
# Shows the first five titles assigned to each cluster label
for cluster in range(n_clusters):
    print(f'\nCluster {cluster}:')
    sample_movies = tag_profiles[tag_profiles['cluster'] == cluster]['title'].head(5)
    print(sample_movies.tolist())