In [None]:
import warnings

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
plt.style.use("fivethirtyeight")     

In [None]:
# Load the dataset from a CSV file in the working directory.
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv("spotify-2023.csv" ,encoding='ISO-8859-1')
# Display the loaded frame as the cell output.
df

# Data Cleaning :

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of rows whose track name already appeared in an earlier row.
df["track_name"].duplicated().sum()

In [None]:
# Columns that are not used anywhere in this analysis.
unused_columns = [
    "key", "in_spotify_playlists", "in_apple_playlists", "in_deezer_playlists",
    "acousticness_%", "speechiness_%", "in_apple_charts", "in_deezer_charts",
    "in_shazam_charts",
]

# Build the cleaned frame in one chain instead of mutating `df` in place:
#   * errors="ignore" lets the cell be re-run after the columns are already gone;
#   * only the first row is kept for every track name
#     (NOTE(review): this also removes different songs that share a title - confirm intended);
#   * remaining gaps are forward-filled with DataFrame.ffill(), because
#     fillna(method="ffill") is deprecated in recent pandas releases;
#   * the rows are renumbered 0..n-1.
df = (
    df.drop(columns=unused_columns, errors="ignore")
    .drop_duplicates(subset=["track_name"])
    .ffill()
    .reset_index(drop=True)
)

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Convert the stream counts to integers.
# Entries that are not a clean number become NaN and their rows are dropped.
# Taking the first run of digits out of a corrupted entry would silently turn it
# into an arbitrary stream count, and a NaN cannot be cast to an integer at all.
parsed_streams = pd.to_numeric(df["streams"], errors="coerce")
valid_rows = parsed_streams.notna()

# "int64" is spelled out because plain "int" can be 32-bit on some platforms,
# which is too small for counts in the billions.
# The cell is also safe to re-run: to_numeric accepts an already-numeric column.
df = (
    df.loc[valid_rows]
    .assign(streams=parsed_streams[valid_rows].astype("int64"))
    .reset_index(drop=True)
)

In [None]:
# Make sure no stream count carries a minus sign.
df = df.assign(streams=df["streams"].abs())

In [None]:
# Give the artist column a shorter name that is easier to type.
df = df.rename(columns={"artist(s)_name": "artist"})

# Exploratory Data Analysis :

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# First ten rows.
df.head(10)

In [None]:
# Last ten rows.
df.tail(10)

In [None]:
# Three randomly chosen rows; no random_state is set, so the rows differ between runs.
df.sample(3)

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include="all")

# Visualization :

In [None]:
# Order the tracks from most to least streamed and renumber the rows,
# so that row 0 is the most-streamed track.
df = df.sort_values("streams", ascending=False).reset_index(drop=True)

# Column by column :

In [None]:
# Number of tracks per artist for the five artists with the most tracks.
# The counts are turned into an explicit two-column frame so that the plot does
# not depend on how a given seaborn version interprets a bare Series.
artist_track_counts = (
    df["artist"].value_counts().head()
    .rename_axis("artist")
    .reset_index(name="tracks")
)

fig, ax = plt.subplots()
sns.barplot(data=artist_track_counts, x="artist", y="tracks", palette="plasma", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14)

# The chart shows how many tracks each artist has, not how often they were streamed.
ax.set_title("Artists With the Most Tracks in the Dataset")
ax.set_xlabel("Artist")
ax.set_ylabel("Music Count")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Number of songs for each value of artist_count.
# value_counts() orders by frequency, so the result is sorted by artist_count;
# otherwise the line would jump back and forth along the x axis.
songs_per_artist_count = df["artist_count"].value_counts().sort_index()

fig, ax = plt.subplots()
ax.plot(songs_per_artist_count.index, songs_per_artist_count.values, marker="o")
ax.set_title("Songs by Number of Artists")
ax.set_xlabel("Number of Artists")
ax.set_ylabel("Songs Count")
plt.show()

In [None]:
# The five release years with the most tracks, as an explicit two-column frame
# so the plot does not depend on how seaborn interprets a bare Series.
year_counts = (
    df["released_year"].value_counts().head()
    .rename_axis("released_year")
    .reset_index(name="tracks")
)

fig, ax = plt.subplots()
sns.barplot(data=year_counts, x="released_year", y="tracks", palette="cool", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Years With the Most Releases")
ax.set_xlabel("Released Year")
ax.set_ylabel("Music Count")
plt.show()


In [None]:
# The five release months with the most tracks, as an explicit two-column frame
# so the plot does not depend on how seaborn interprets a bare Series.
month_counts = (
    df["released_month"].value_counts().head()
    .rename_axis("released_month")
    .reset_index(name="tracks")
)

fig, ax = plt.subplots()
sns.barplot(data=month_counts, x="released_month", y="tracks", palette="rainbow", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Months With the Most Releases")
ax.set_xlabel("Released Month")
ax.set_ylabel("Music Count")
plt.show()


In [None]:
# The five release days with the most tracks, as an explicit two-column frame
# so the plot does not depend on how seaborn interprets a bare Series.
day_counts = (
    df["released_day"].value_counts().head()
    .rename_axis("released_day")
    .reset_index(name="tracks")
)

fig, ax = plt.subplots()
sns.barplot(data=day_counts, x="released_day", y="tracks", palette="rainbow", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Days With the Most Releases")
ax.set_xlabel("Released Day")
ax.set_ylabel("Music Count")
plt.show()


In [None]:
# The fifteen release years with the most tracks, as an explicit two-column
# frame so the plot does not depend on how seaborn interprets a bare Series.
top_year_counts = (
    df["released_year"].value_counts().head(15)
    .rename_axis("released_year")
    .reset_index(name="tracks")
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_year_counts, x="released_year", y="tracks", palette="viridis", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Number of Songs per Release Year (Top 15 Years)", fontsize=20)
ax.set_xlabel("Year", fontsize=16)
ax.set_ylabel("Number of Songs", fontsize=16)
ax.tick_params(axis="x", rotation=45)  # keep the year labels from overlapping
plt.show()

In [None]:
# Distribution of release years with a kernel density estimate on top.
# No palette is passed: seaborn only applies a palette together with `hue`,
# so the argument had no effect on this single-series histogram.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df["released_year"], kde=True, ax=ax)

ax.set_title("Distribution of Songs by Release Year", fontsize=20)
ax.set_xlabel("Year", fontsize=16)
ax.set_ylabel("Number of Songs", fontsize=16)
ax.tick_params(axis="x", rotation=45)  # keep the year labels from overlapping
plt.show()

In [None]:
# Share of tracks for each value of the `mode` column, labelled with percentages.
ax = df["mode"].value_counts().plot(kind="pie", autopct="%1.1f%%")
ax.set_title("Share of Tracks by Mode")
ax.set_ylabel("")  # hide the axis label pandas adds automatically; it is meaningless on a pie
plt.show()

# Relationship between Columns :

In [None]:
# Total number of tracks that belong to the 50 artists with the most tracks.
df["artist"].value_counts().head(50).sum()

In [None]:
# The ten most-streamed tracks, selected explicitly.
# Passing a 10-row x next to the full-length y column only worked through index
# alignment and only if `df` had been sorted by streams beforehand.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="streams", palette="cool_r", ax=ax)
ax.set_title("Top 10 Tracks Most Streamed All The Time")
ax.set_xlabel("Track")
ax.set_ylabel("Streams")
ax.tick_params(axis="x", rotation=90)
# Show full stream counts with thousands separators instead of scientific notation.
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))
plt.show()

In [None]:
# One panel per release year, filled row by row across the 2x2 grid.
release_years = [2023, 2022, 2021, 2020]
label_font_size = 19.5

fig, axs = plt.subplots(2, 2, figsize=(24, 24))

for ax, year in zip(axs.flat, release_years):
    # Ten most-streamed tracks among those released in `year`.
    top_tracks = df[df["released_year"] == year].nlargest(10, "streams")

    sns.barplot(data=top_tracks, x="track_name", y="streams", palette="rainbow", ax=ax)
    # The filter is on the release year, so the title says "released in".
    ax.set_title(f"Top 10 Most Streamed Tracks Released in {year}\n", fontsize=28)
    ax.set_xlabel("Track", fontsize=25)
    ax.set_ylabel("Streams", fontsize=22)
    ax.tick_params(axis='x', rotation=90, labelsize=label_font_size)
    ax.tick_params(axis='y', labelsize=label_font_size)
    # Show full stream counts with thousands separators instead of scientific notation.
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))

plt.tight_layout()
plt.show()


In [None]:
# BPM of the ten most-streamed tracks. The rows are selected explicitly so that
# x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="bpm", palette="flare", ax=ax)

# Write the exact value above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("BPM of Top 10 Tracks Most Streamed All The Time \n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Beats Per Minute")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Danceability of the ten most-streamed tracks. The rows are selected explicitly so
# that x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="danceability_%", palette="turbo", ax=ax)

# Write the exact value above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Danceability of Top 10 Tracks Most Streamed All The Time \n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Danceability %")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Valence of the ten most-streamed tracks. The rows are selected explicitly so
# that x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="valence_%", palette="cool_r", ax=ax)

# Write the exact value above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Positivity of Top 10 Tracks Most Streamed All The Time \n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Positivity %")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Energy of the ten most-streamed tracks. The rows are selected explicitly so
# that x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="energy_%", palette="turbo", ax=ax)

# Write the exact value above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Energy of Top 10 Tracks Most Streamed All The Time \n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Energy %")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Instrumentalness of the ten most-streamed tracks. The rows are selected explicitly so
# that x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="instrumentalness_%", palette="coolwarm_r", ax=ax)
ax.set_title("Instrumentalness of Top 10 Tracks Most Streamed All The Time \n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Amount of instrumental content %")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Liveness of the ten most-streamed tracks. The rows are selected explicitly so
# that x and y have the same length and the cell does not depend on `df` being sorted.
top_tracks = df.nlargest(10, "streams")

fig, ax = plt.subplots()
sns.barplot(data=top_tracks, x="track_name", y="liveness_%", palette="cool", ax=ax)

# Write the exact value above every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=14.5)

ax.set_title("Liveness of Top 10 Tracks Most Streamed All The Time\n")
ax.set_xlabel("\n Track")
ax.set_ylabel("Presence of live performance elements")
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Total streams per artist, ten largest totals.
# Streams are summed over all of an artist's tracks; taking the artist of each
# top-10 track would rank tracks rather than artists, and an artist appearing
# twice would be drawn as the mean of those tracks.
# NOTE(review): each distinct value of the `artist` column is treated as one
# artist; if collaborations are stored as a combined string they count separately.
artist_streams = (
    df.groupby("artist", as_index=False)["streams"].sum()
    .nlargest(10, "streams")
)

fig, ax = plt.subplots()
sns.barplot(data=artist_streams, x="artist", y="streams", palette="plasma", ax=ax)
ax.set_title("Top 10 Artists Most Streamed All The Time")
ax.set_xlabel("Artist")
ax.set_ylabel("Streams")
ax.tick_params(axis="x", rotation=90)
# Show full stream counts with thousands separators instead of scientific notation.
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))
plt.show()

In [None]:
# One panel per release year, filled row by row across the 2x2 grid.
release_years = [2023, 2022, 2021, 2020]
label_font_size = 19.5

fig, axs = plt.subplots(2, 2, figsize=(24, 24))

for ax, year in zip(axs.flat, release_years):
    # Total streams per artist over the tracks released in `year`, ten largest.
    # Summing per artist ranks artists; taking the artist of each top track
    # would rank tracks and average an artist that appears more than once.
    artist_streams = (
        df[df["released_year"] == year]
        .groupby("artist", as_index=False)["streams"].sum()
        .nlargest(10, "streams")
    )

    sns.barplot(data=artist_streams, x="artist", y="streams", palette="cool_r", ax=ax)
    # The filter is on the release year, so the title says "released in".
    ax.set_title(f"Top 10 Most Streamed Artists for Tracks Released in {year}\n", fontsize=30)
    ax.set_xlabel("Artist", fontsize=28)
    ax.set_ylabel("Streams", fontsize=25)
    ax.tick_params(axis='x', rotation=90, labelsize=label_font_size)
    ax.tick_params(axis='y', labelsize=label_font_size)
    # Show full stream counts with thousands separators instead of scientific notation.
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))

plt.tight_layout()
plt.show()


In [None]:
# Grid of pairwise plots over the columns of `df`.
# NOTE(review): this can take a while on a frame with many columns.
sns.pairplot(df)

In [None]:
# Correlation matrix of all numeric columns.
# include="number" selects every numeric dtype, whereas [int, float] only
# matches the default integer and float types.
numeric_df = df.select_dtypes(include="number")
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(32.5, 32.5))
sns.heatmap(corr, annot=True, ax=ax)  # annot=True writes the coefficient in each cell
ax.set_title("Correlation Between Numeric Columns")
plt.show()

# Summary:

In [None]:
def summary(df):
    """Print the shape of ``df`` and show a per-column overview table.

    The table has one row per column of ``df`` with: data type, number and
    percentage of missing values, number of fully duplicated rows (a single
    frame-wide figure repeated on every row), number of unique values,
    non-null count, min / max / mean / standard deviation as reported by
    ``DataFrame.describe`` (NaN where a statistic is not available), and the
    values held by the first three rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        The table is shown with IPython's ``display`` when IPython is
        installed and printed otherwise.
    """
    print(f'data shape: {df.shape}')

    summ = pd.DataFrame(df.dtypes, columns=['Data Type'])

    missing = df.isna().sum()
    summ['Missing#'] = missing
    # Scaled to 0-100 so the value matches the "%" in the column label.
    summ['Missing%'] = missing / len(df) * 100
    summ['Dups'] = df.duplicated().sum()
    summ['Uniques'] = df.nunique().values
    summ['Count'] = df.count().values

    # describe() leaves out statistics that apply to none of the columns
    # (e.g. no 'min' for an all-text frame); reindex adds them back as NaN
    # so the lookups below cannot raise KeyError.
    desc = (
        df.describe(include='all')
        .transpose()
        .reindex(columns=['min', 'max', 'mean', 'std'])
    )
    summ['Min'] = desc['min'].values
    summ['Max'] = desc['max'].values
    summ['Average'] = desc['mean'].values
    summ['Standard Deviation'] = desc['std'].values

    # Rows are taken by position, so this works whatever the index labels are;
    # frames with fewer than three rows get NaN for the rows they lack.
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        if position < len(df):
            summ[label] = df.iloc[position].values
        else:
            summ[label] = float('nan')

    # `display` is only a builtin inside IPython/Jupyter; import it explicitly
    # and fall back to print so the function also runs in plain Python.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print
    show(summ)

# Show the overview table for the current state of `df`.
summary(df)