In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the three source tables from CSV files in the working directory.
movies, ratings, users = (
    pd.read_csv('Movies.csv'),
    pd.read_csv('Ratings.csv'),
    pd.read_csv('Users.csv'),
)

Top 10 Movies by Average Rating

In [None]:
# Join every rating row to its movie row on MovieID.
movie_ratings = pd.merge(ratings, movies, on='MovieID')

# Mean rating per title, highest first, truncated to the ten best.
# NOTE(review): no minimum number of ratings is required, so a title with a
# single high rating can rank at the top — confirm that is intended.
avg_ratings = (
    movie_ratings
    .groupby('Title')['Rating']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Series -> frame with 'Title' and 'Rating' columns so seaborn can address them by name.
avg_ratings_df = avg_ratings.reset_index()

# Horizontal bars; hue repeats the y variable only so each bar gets its own
# palette colour, and the redundant legend is switched off.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=avg_ratings_df,
    x='Rating',
    y='Title',
    hue='Title',
    palette='Set2',
    dodge=False,
    legend=False,
)
ax.set_xlabel('Average Rating')
ax.set_title('Top 10 Movies by Average Ratings')
plt.tight_layout()
plt.show()

Top 10 Movies by Views

In [None]:
# Order movies by TotalViews (largest first) and keep the first ten rows.
top_movies = movies.sort_values(by='TotalViews', ascending=False).iloc[:10]

# Horizontal bars; hue repeats the y variable so each bar gets its own colour
# (passing palette without hue is deprecated in seaborn), legend suppressed.
plt.figure(figsize=(10, 5))
ax = sns.barplot(
    data=top_movies,
    x='TotalViews',
    y='Title',
    hue='Title',
    palette='Set2',
    dodge=False,
    legend=False,
)
ax.set_title('Top 10 Most Viewed Movies')
ax.set_xlabel('Total Views')
ax.set_ylabel('Movie Title')
plt.tight_layout()
plt.show()

Total Watch Time by Subscription Status

In [None]:
# Summed TotalWatchTime per SubscriptionStatus (Series indexed by status).
watch_time_share = users.groupby('SubscriptionStatus')['TotalWatchTime'].sum()

# Pie chart drawn on an explicit Axes: one wedge per status, labelled with its
# percentage of the overall watch time.
ax = plt.figure(figsize=(6, 6)).add_subplot()
ax.pie(
    watch_time_share,
    labels=watch_time_share.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('Set3'),
)
ax.set_title('Share of Total Watch Time by Subscription Status')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.show()

Most Popular Genres 

In [None]:
# Each movie's Genres string is '|'-separated: split it into columns, stack
# the columns into one long Series (padding is dropped), and discard the index.
all_genres = (
    movies['Genres']
    .str.split('|', expand=True)
    .stack()
    .reset_index(drop=True)
)

# Occurrences per genre, most frequent first, limited to the ten most common.
genre_counts = all_genres.value_counts().head(10)

# Two-column frame ('Genre', 'Count') for seaborn.
genre_df = genre_counts.rename_axis('Genre').reset_index(name='Count')

# Vertical bars; hue repeats the x variable so each bar gets its own colour.
plt.figure(figsize=(10, 5))
ax = sns.barplot(
    data=genre_df,
    x='Genre',
    y='Count',
    hue='Genre',
    palette='Set2',
    dodge=False,
    legend=False,
)
ax.set_ylabel('Number of Movies')
ax.set_title('Most Popular Genres')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

User Distribution by Age Group and Subscription Status

In [None]:
# Age brackets are half-open, [lower, upper), because right=False is passed to
# pd.cut below. The last edge is infinity so the '65+' bracket is genuinely
# open-ended: with a finite cap of 100, any age of 100 or more would fall
# outside every bin, become NaN, and be silently left out of the chart.
bins = [0, 18, 25, 35, 50, 65, float('inf')]
labels = ['<18', '18-24', '25-34', '35-49', '50-64', '65+']

# Adds (or overwrites) an ordered categorical 'AgeGroup' column on users.
users['AgeGroup'] = pd.cut(users['Age'], bins=bins, labels=labels, right=False)

# countplot tallies the rows itself, so no pre-aggregated frame is needed:
# one group of bars per age bracket, one bar per subscription status.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=users,
    x='AgeGroup',
    hue='SubscriptionStatus',
    palette='Set2'
)

plt.title('User Distribution by Age Group and Subscription Status')
plt.xlabel('Age Group')
plt.ylabel('Number of Users')
plt.legend(title='Subscription Status')
plt.tight_layout()
plt.show()

Subscription Status Distribution

In [None]:
# Number of users per subscription status, most common first.
subscription_counts = users['SubscriptionStatus'].value_counts()

# Pie chart on an explicit Axes: one wedge per status, labelled with its
# percentage of all users.
ax = plt.figure(figsize=(6, 6)).add_subplot()
ax.pie(
    subscription_counts,
    labels=subscription_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('pastel'),
)
ax.set_title('Distribution of Subscription Status')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

User Distribution by Country

In [None]:
# Ten countries with the most users, as a ('Country', 'Count') frame.
country_counts = (
    users['Country']
    .value_counts()
    .head(10)
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Vertical bars; hue repeats the x variable so each bar gets its own colour.
plt.figure(figsize=(10, 5))
ax = sns.barplot(
    data=country_counts,
    x='Country',
    y='Count',
    hue='Country',
    palette='Set2',
    dodge=False,
    legend=False,
)
ax.set_title('Top Countries by Number of Users')
ax.set_ylabel('Users')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# World map of users per country (Plotly choropleth)
import plotly.io as pio

# Make "browser" the default Plotly renderer for the rest of the session.
pio.renderers.default = "browser"

# User count for every country in users (no top-N cut-off here).
# This rebinds country_counts with a 'UserCount' column instead of 'Count'.
country_counts = (
    users['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='UserCount')
)

# Countries are matched by their names and shaded by user count.
fig = px.choropleth(
    data_frame=country_counts,
    locations='Country',
    locationmode='country names',
    color='UserCount',
    color_continuous_scale='blues',
    title='User Distribution by Country',
)

# Render with the default renderer configured above.
fig.show()

Device Usage Distribution

In [None]:
# Users per device type, as a ('Device', 'Count') frame, most common first.
device_counts = (
    users['Device']
    .value_counts()
    .rename_axis('Device')
    .reset_index(name='Count')
)

# Vertical bars; hue repeats the x variable so each bar gets its own colour.
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    data=device_counts,
    x='Device',
    y='Count',
    hue='Device',
    palette='cool',
    dodge=False,
    legend=False,
)
ax.set_title('Distribution of Device Usage')
ax.set_ylabel('Users')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Bonus: Evolution of Genre Preferences Among Women (1975 Onwards)

In [None]:
# Clean the Year column and restrict the catalogue to 1975 onwards.
#
# Everything happens in one method chain, so each step works on a fresh frame:
# no column is assigned onto the filtered result of dropna (the pattern that
# can raise SettingWithCopyWarning) and the loaded frame is not mutated in place.
#
# NOTE: this rebinds `movies` to the filtered frame, so any cell above that is
# re-run after this one will see only movies from 1975 onwards.
movies = (
    movies
    # Values that cannot be parsed as a number become NaN ...
    .assign(Year=lambda frame: pd.to_numeric(frame['Year'], errors='coerce'))
    # ... and those rows are discarded.
    .dropna(subset=['Year'])
    # With the NaNs gone, the column can be cast to integer.
    .astype({'Year': int})
    # Keep movies from 1975 onwards.
    .loc[lambda frame: frame['Year'] >= 1975]
)

In [None]:
# One row per rating, enriched with the rater's user record and the movie record.
df = pd.merge(ratings, users, on='UserID').merge(movies, on='MovieID')

# Ratings by female users only, with the '|'-separated Genres string split
# and exploded so every (rating, genre) pair becomes its own row.
# assign() returns a new frame, so df itself is left untouched.
df_female = (
    df.loc[df['Gender'].eq('F')]
    .assign(Genres=lambda frame: frame['Genres'].str.split('|'))
    .explode('Genres')
)

# Number of ratings by women for each (Year, genre) combination.
genre_trend = (
    df_female
    .groupby(['Year', 'Genres'])
    .size()
    .reset_index(name='Count')
)

# The five genres with the most ratings from women across all years.
top_genres = genre_trend.groupby('Genres')['Count'].sum().nlargest(5).index
filtered = genre_trend.loc[genre_trend['Genres'].isin(top_genres)]

# One line per genre showing the rating count by Year.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered, x='Year', y='Count', hue='Genres', marker='o')
ax.set_title("Evolution of Genre Preferences Among Women (1975–Present)")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Ratings by Female Users")
ax.legend(title='Genre')
plt.tight_layout()
plt.show()