# Songs Dataset Analysis (2000-2020)

This project analyzes the "Songs Dataset 2000-2020" from Kaggle to visualize and understand music trends over the past two decades.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import kagglehub

# Global plotting defaults: seaborn "whitegrid" theme and a 12x8 default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

## Data Loading

Loading the songs dataset from Kaggle.

In [None]:
# Download dataset; `path` is treated below as the local directory holding its files
import os

path = kagglehub.dataset_download("waqi786/songs-dataset-2000-2020-50k-records")
print(f"Dataset downloaded to: {path}")

# List available files (sorted so the printed listing is deterministic)
dataset_files = sorted(os.listdir(path))
for file in dataset_files:
    print(f"- {file}")

# Load the dataset.
# Prefer the expected "data.csv"; if the download ships its CSV under another
# name, fall back to the only CSV present rather than failing on a hard-coded
# filename. Ambiguous or missing CSVs raise FileNotFoundError with the listing.
csv_files = [name for name in dataset_files if name.lower().endswith(".csv")]
if "data.csv" in csv_files:
    csv_name = "data.csv"
elif len(csv_files) == 1:
    csv_name = csv_files[0]
else:
    raise FileNotFoundError(
        f"Could not pick a CSV file in {path}; files found: {dataset_files}"
    )

df = pd.read_csv(os.path.join(path, csv_name))
df.head()

## Top 20 Songs by Popularity

Visualization of the most popular songs based on popularity scores.

In [None]:
# Make sure the output folder for saved figures is present
os.makedirs("songs_figures", exist_ok=True)

# Horizontal bar chart of the 20 rows with the highest popularity score
top_songs = df.sort_values("popularity", ascending=False).head(20)
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=top_songs, x="popularity", y="name", ax=ax)
ax.set_title("Top 20 Most Popular Songs (2000-2020)", fontsize=16)
fig.tight_layout()
fig.savefig("songs_figures/top20_popular_songs.png")
plt.show()

## Song Duration vs. Popularity

Analysis of how the length of a song affects its popularity.

In [None]:
# Plot duration vs popularity analysis
plt.figure(figsize=(10, 8))
# Convert duration_ms to minutes (stored on df as a new "duration_min" column)
df["duration_min"] = df["duration_ms"] / 60000
sns.scatterplot(x="duration_min", y="popularity", data=df, alpha=0.5)
# Add trend line
sns.regplot(x="duration_min", y="popularity", data=df, scatter=False, color="red")
plt.title("Relationship Between Song Duration and Popularity", fontsize=16)
plt.xlabel("Duration (minutes)")
plt.ylabel("Popularity Score")
# Calculate correlation on complete (duration, popularity) pairs only, so
# missing values cannot propagate into the statistic shown on the plot.
complete_pairs = df[["duration_min", "popularity"]].dropna()
if len(complete_pairs) >= 2:
    correlation = stats.pearsonr(complete_pairs["duration_min"], complete_pairs["popularity"])
    # Use significant digits for p: a fixed ".4f" prints very small p-values as 0.0000
    correlation_label = f"Correlation: {correlation[0]:.2f} (p={correlation[1]:.3g})"
else:
    # pearsonr needs at least two observations
    correlation = None
    correlation_label = "Correlation: n/a"
plt.annotate(correlation_label, xy=(0.05, 0.95), xycoords="axes fraction")
plt.tight_layout()
plt.savefig("songs_figures/duration_vs_popularity.png")
plt.show()

## Popularity Distribution by Genre

Comparison of the popularity of different music genres.

In [None]:
# Mean popularity per genre, ordered from highest to lowest
genre_pop = (
    df.groupby("genre")["popularity"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# One horizontal bar per genre
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=genre_pop, x="popularity", y="genre", ax=ax)
ax.set_title("Average Popularity by Music Genre", fontsize=16)
fig.tight_layout()
fig.savefig("songs_figures/genre_popularity.png")
plt.show()

## Genre Distribution

Exploration of the distribution of music genres in the dataset.

In [None]:
# Count rows per genre and show each genre's share as a pie slice
genre_counts = df["genre"].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    genre_counts,
    labels=genre_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title("Distribution of Music Genres", fontsize=16)
# Equal axis scaling keeps the pie circular
ax.axis('equal')
fig.tight_layout()
fig.savefig("songs_figures/genre_distribution.png")
plt.show()

## Trend of Song Characteristics Over Time

Examination of how song attributes have evolved over the years.

In [None]:
# Plot song characteristics trends over time
# Group by year and calculate averages of characteristics
yearly_attrs = df.groupby("year")[["danceability", "energy", "acousticness", "tempo", "duration_ms"]].mean()
# Convert duration to minutes
yearly_attrs["duration_min"] = yearly_attrs["duration_ms"] / 60000
yearly_attrs = yearly_attrs.drop("duration_ms", axis=1)

# Plot the trends on explicit axes. Creating the twin axis makes it the
# "current" axes, so pyplot-level title/label/legend calls issued afterwards
# would target the secondary axis instead of the main one.
fig, ax = plt.subplots(figsize=(14, 10))

# Left axis: every yearly mean except duration and tempo. No rescaling is
# applied here; these columns are presumably already on a comparable scale
# (TODO confirm). Tempo stays in yearly_attrs but is not drawn, as before.
left_axis_columns = [c for c in yearly_attrs.columns if c not in ("duration_min", "tempo")]
for column in left_axis_columns:
    ax.plot(yearly_attrs.index, yearly_attrs[column], marker='o', label=column)

# Right axis: average duration in minutes
ax2 = ax.twinx()
ax2.plot(yearly_attrs.index, yearly_attrs["duration_min"], marker='s', color='black', linestyle='--', label='Duration (min)')
ax2.set_ylabel('Duration (minutes)')
# Keep a single grid (from the left axis) so two tick grids do not overlap
ax2.grid(False)

ax.set_title("Trends in Song Characteristics (2000-2020)", fontsize=16)
ax.set_xlabel("Year")
ax.set_ylabel("Normalized Attribute Value")
ax.grid(True)

# Build one legend covering the lines of both axes; attach it to the twin
# axis, which is drawn on top, so no line is painted over the legend box.
left_handles, left_labels = ax.get_legend_handles_labels()
right_handles, right_labels = ax2.get_legend_handles_labels()
ax2.legend(left_handles + right_handles, left_labels + right_labels)

fig.tight_layout()
fig.savefig("songs_figures/song_trends_over_time.png")
plt.show()

## Danceability vs. Energy

Analysis of the relationship between a song's danceability and energy levels.

In [None]:
# Scatter of energy against danceability, one colour per genre
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="danceability",
    y="energy",
    hue="genre",
    alpha=0.5,
    ax=ax,
)
ax.set_title("Relationship Between Danceability and Energy by Genre", fontsize=16)
fig.tight_layout()
fig.savefig("songs_figures/danceability_vs_energy.png")
plt.show()

## Key Findings

- Pop songs by artists like Ed Sheeran and The Weeknd dominate in terms of popularity
- There is a slight negative correlation between song duration and popularity, suggesting shorter songs may perform better
- Pop music is the most dominant genre both in quantity and average popularity
- Over time, song durations have generally decreased, reflecting changing listening habits in the streaming era
- Most successful songs achieve a balance between danceability and energy attributes
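- Streaming numbers generally correlate with popularity scores, highlighting the importance of streaming platforms (note: no cell in this notebook plots or tests streaming counts, so this finding should be verified against the data)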