In [None]:
# EDA Overview of Genre, Language, and Audio Trends

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- Data loading ------------------------------------------------------------
# Three CSV inputs are read from the sibling data directory: basic track
# metadata, per-track audio features, and the language detected per track.
tracks = pd.read_csv('../data/tracks_2020.csv')
audio = pd.read_csv('../data/audio_features.csv')
languages = pd.read_csv('../data/language_detected.csv')

# --- Assemble one analysis frame ---------------------------------------------
# Left join keyed on track_id: every row of `tracks` is kept, and tracks with
# no match in `audio` simply get NaN in the audio-feature columns.
df = tracks.merge(audio, how='left', on='track_id')

# Only the `language` column is taken from the language table (plus the join
# key); this is again a left join on track_id.
language_lookup = languages[['track_id', 'language']]
df = df.merge(language_lookup, how='left', on='track_id')

# --- Release year ------------------------------------------------------------
# errors='coerce' turns unparseable dates into NaT instead of raising, so the
# derived `year` is NaN for those rows. The year is used for trend-over-time
# analysis.
release_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = release_dates
df['year'] = release_dates.dt.year

# Plot the number of tracks released per year.
# Gives a basic sense of how the volume of music releases has changed over time.
#
# Rows whose release_date could not be parsed have a NaN year and are left out.
# The remaining years are cast to int and drawn with discrete=True so that each
# bar covers exactly one calendar year. A fixed bins=20 would instead cut the
# year range into 20 equal-width intervals that generally do not line up with
# whole years, which makes a "count by year" chart misleading.
release_years = df['year'].dropna().astype(int)

plt.figure(figsize=(10, 4))
sns.histplot(release_years, discrete=True)
plt.title("Track Count by Year")
plt.xlabel("Year")
plt.ylabel("Number of Tracks")
plt.tight_layout()
plt.show()

# Horizontal bar chart of the ten most frequent values in the `language` column.
# Helps understand the language diversity of the tracks in the dataset.
#
# value_counts() returns counts sorted in descending order (NaN excluded), so
# the first ten index labels are the ten most common languages; passing them
# as `order` both restricts the chart to those languages and fixes bar order.
top_languages = df['language'].value_counts().head(10).index

plt.figure(figsize=(8, 4))
sns.countplot(data=df, y='language', order=top_languages)
plt.title("Top Detected Languages")
plt.tight_layout()
plt.show()