In [51]:
#Project Name
#Amazon Prime TV Shows and Movies – Exploratory Data Analysis

In [52]:
#Project Type
#EDA

In [53]:
#Team Member 1
#Ajay Bramhankar

Project Summary
With the rapid growth of OTT platforms, Amazon Prime Video has become one of the most popular streaming services worldwide. As the content library continues to expand, there is a growing need to analyze the platform’s offerings to better understand user preferences, content trends, and business opportunities. This project aims to perform a comprehensive Exploratory Data Analysis (EDA) on Amazon Prime’s catalog of TV shows and movies to extract actionable insights.

The analysis is based on two key datasets:

titles.csv: Contains metadata for 9,871 titles across 15 columns, including content type (Movie/Show), release year, runtime, genres, IMDb and TMDb scores, and production countries.

credits.csv: Contains cast and crew information (124,235 records across 5 columns), including person names, character names, roles (actor, director), and the associated title IDs.

Using the UBM framework — Univariate, Bivariate, and Multivariate Analysis — this project explores data distributions, relationships between variables, and multi-variable trends.

In the Univariate Analysis, we study individual features such as content type, release year, runtime, genres, and IMDb scores. Results show that Amazon Prime is dominated by movies (around 70%), with drama, comedy, and action as the most common genres. Content production increased significantly after 2010. IMDb ratings are concentrated between 6 and 8, indicating average to good viewer reception.

The Bivariate Analysis explores relationships between two variables. We examine how runtime relates to IMDb scores, how genres influence average ratings, and how content type varies in performance. Interestingly, longer runtimes do not always correlate with higher ratings. Some genres like War and History, while less frequent, tend to receive higher average scores.

The Multivariate Analysis combines multiple dimensions — such as content type, genre, runtime, and score — to uncover deeper insights. For example, we explore the correlation between IMDb and TMDb ratings, grouped by content type. The data also reveals strong production dominance by the United States, followed by countries like the UK, Canada, and India.

Over 20 visualizations were created, each accompanied by proper justification and business insights. For example, knowing that certain genres consistently perform well helps guide content acquisition strategies. Runtime patterns over time can inform decisions about optimal content lengths. Correlation plots between IMDb and TMDb ratings highlight reliability and viewer agreement across platforms.

In [55]:
!pip install pandas matplotlib seaborn --quiet


In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from ast import literal_eval

# Notebook-wide plotting defaults: seaborn's white grid theme and a 10x6 inch
# default canvas for every figure that does not pass its own figsize.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})


In [57]:
import zipfile
import pandas as pd
import os
from pathlib import Path

# Folder that holds titles.csv.zip and credits.csv.zip.
# Defaults to the current user's Downloads folder; set the PRIME_DATA_DIR
# environment variable to point somewhere else instead of editing this cell.
DATA_DIR = Path(os.environ.get("PRIME_DATA_DIR", Path.home() / "Downloads"))

# Archive locations
titles_zip_path = DATA_DIR / "titles.csv.zip"
credits_zip_path = DATA_DIR / "credits.csv.zip"

# Target extraction folder (the archives are unpacked next to themselves)
extract_path = DATA_DIR

# Unzip both archives, failing early with a readable message if one is missing
for archive_path in (titles_zip_path, credits_zip_path):
    if not archive_path.is_file():
        raise FileNotFoundError(
            f"Expected archive not found: {archive_path}. "
            "Place it there or set PRIME_DATA_DIR to the folder containing it."
        )
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Load the extracted CSVs
titles_csv = os.path.join(extract_path, "titles.csv")
credits_csv = os.path.join(extract_path, "credits.csv")

titles_df = pd.read_csv(titles_csv)
credits_df = pd.read_csv(credits_csv)

# Preview
print("Titles Shape:", titles_df.shape)
print("Credits Shape:", credits_df.shape)
titles_df.head()


Titles Shape: (9871, 15)
Credits Shape: (124235, 5)


Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


In [58]:
# Report the dimensions of both tables, then render the first rows of each.
for shape_label, frame in (("Titles Shape:", titles_df), ("Credits Shape:", credits_df)):
    print(shape_label, frame.shape)

display(titles_df.head(), credits_df.head())


Titles Shape: (9871, 15)
Credits Shape: (124235, 5)


Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


Unnamed: 0,person_id,id,name,character,role
0,59401,ts20945,Joe Besser,Joe,ACTOR
1,31460,ts20945,Moe Howard,Moe,ACTOR
2,31461,ts20945,Larry Fine,Larry,ACTOR
3,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR
4,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from ast import literal_eval

# Set styles
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)


def _parse_genres(value):
    """Return the list of genres for one entry of the 'genres' column.

    Strings (the raw CSV form, e.g. "['comedy', 'drama']") are parsed with
    literal_eval, values that are already lists are returned unchanged, and
    anything else (missing values) becomes an empty list.
    """
    if isinstance(value, list):
        # Already converted by a previous run of this cell; keeps the cell re-runnable.
        return value
    if isinstance(value, str):
        return literal_eval(value)
    return []


# Convert 'genres' column to lists (safe to execute more than once)
titles_df['genres'] = titles_df['genres'].apply(_parse_genres)

# 1. Movies vs TV Shows
type_counts = titles_df['type'].value_counts()

# 2. Titles released per year
yearly_counts = titles_df.groupby('release_year')['id'].count()

# 3. Genre distribution: a title with several genres contributes to each of them
all_genres = [genre for sublist in titles_df['genres'] for genre in sublist]
genre_counts = Counter(all_genres)
genre_df = pd.DataFrame(genre_counts.items(), columns=["Genre", "Count"]).sort_values(by="Count", ascending=False)

# 4. IMDb scores (titles without a score are excluded)
imdb_scores = titles_df['imdb_score'].dropna()

# Plotting: one 2x2 grid holding the four univariate charts
fig, axs = plt.subplots(2, 2, figsize=(18, 12))

# Chart 1: Content type
sns.barplot(x=type_counts.index, y=type_counts.values, ax=axs[0, 0])
axs[0, 0].set_title("Count of Movies vs TV Shows")
axs[0, 0].set_xlabel("Type")
axs[0, 0].set_ylabel("Count")

# Chart 2: Titles per year
sns.lineplot(x=yearly_counts.index, y=yearly_counts.values, ax=axs[0, 1])
axs[0, 1].set_title("Titles Released by Year")
axs[0, 1].set_xlabel("Year")
axs[0, 1].set_ylabel("Number of Titles")

# Chart 3: Top 10 genres
sns.barplot(data=genre_df.head(10), x="Genre", y="Count", ax=axs[1, 0])
axs[1, 0].set_title("Top 10 Genres")
axs[1, 0].tick_params(axis='x', rotation=45)

# Chart 4: IMDb score distribution
sns.histplot(imdb_scores, bins=20, kde=True, ax=axs[1, 1], color='skyblue')
axs[1, 1].set_title("IMDb Score Distribution")
axs[1, 1].set_xlabel("Score")
axs[1, 1].set_ylabel("Frequency")

plt.tight_layout()
plt.show()


In [None]:
# Scatter of IMDb against TMDb ratings, one point per title, coloured by content type.
# Both axes are pinned to 0-10.
score_fig, score_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=titles_df,
    x="imdb_score",
    y="tmdb_score",
    hue="type",
    alpha=0.6,
    ax=score_ax,
)
score_ax.set_title("IMDb Score vs TMDb Score by Content Type")
score_ax.set_xlabel("IMDb Score")
score_ax.set_ylabel("TMDb Score")
score_ax.set_xlim(0, 10)
score_ax.set_ylim(0, 10)
score_ax.legend(title="Type")
plt.show()


In [None]:
# Build the per-country title counts in this cell so it runs on a fresh kernel.
# NOTE(review): no earlier visible cell defines `country_df`; if another cell also
# defines it, confirm both use the same counting rule.
# 'production_countries' holds list literals such as "['US']" in the raw CSV; parse
# strings, accept values that are already lists, and treat anything else as empty.
country_lists = titles_df['production_countries'].apply(
    lambda value: literal_eval(value) if isinstance(value, str)
    else (value if isinstance(value, list) else [])
)

# A co-production contributes one count to each of its countries.
country_counts = Counter(country for countries in country_lists for country in countries)
country_df = pd.DataFrame(
    list(country_counts.items()), columns=["Country", "Count"]
).sort_values(by="Count", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=country_df.head(10), x="Country", y="Count", color='steelblue')
plt.title("Top 10 Production Countries")
plt.xlabel("Country")
plt.ylabel("Number of Titles")
plt.xticks(rotation=45)
plt.show()
