# 🎧 Spotify Music Data Analysis 

# Average Popularity of Music Genres

In [None]:
import pandas as pd
import plotly.express as px

# Load the tracks table. The path points at a .zip archive, so this relies on
# pandas inferring the compression from the file extension.
dataset_path = r"C:\Users\Z-BLOCK\Downloads\archive (17).zip"
df = pd.read_csv(dataset_path)

# Mean popularity per genre, ordered from the most to the least popular genre.
genre_popularity = (
    df.groupby("track_genre")["popularity"]
    .mean()
    .reset_index()
    .sort_values(by="popularity", ascending=False)
)

# Interactive bar chart: one bar per genre, shaded by its mean popularity.
# The chained layout call overrides the axis titles and rotates the x tick
# labels by 90 degrees.
fig = px.bar(
    data_frame=genre_popularity,
    x='track_genre',
    y='popularity',
    color='popularity',
    color_continuous_scale='Blues',
    labels={'track_genre': 'Genre', 'popularity': 'Average Popularity'},
    title='Average Popularity of Music Genres',
).update_layout(
    xaxis_title='Music Genre',
    yaxis_title='Popularity',
    xaxis_tickangle=90,
)

fig.show()

# Top 10 Most Popular Artists Based on Average Track Popularity

In [None]:
# Mean track popularity for every distinct value of the `artists` column,
# returned as a two-column frame (artists, popularity).
artist_popularity = df.groupby('artists', as_index=False)['popularity'].mean()

# Rank the artists from highest to lowest mean popularity.
artist_popularity_sorted = artist_popularity.sort_values(by='popularity', ascending=False)

# The ten highest-ranked rows feed both the printed table and the chart.
top_artists = artist_popularity_sorted.head(10)
print(top_artists)

fig = px.bar(
    data_frame=top_artists,
    x='artists',
    y='popularity',
    color='popularity',
    color_continuous_scale='Viridis',
    labels={'artists': 'Artist', 'popularity': 'Average Popularity'},
    title="Top 10 Most Popular Artists Based on Average Track Popularity",
)

fig.show()


# Do more danceable songs tend to be more popular?

In [None]:
# One marker per track: danceability on the x-axis against popularity on the
# y-axis, coloured by popularity. Artist and track name are added to the hover box.
fig = px.scatter(
    data_frame=df,
    x='danceability',
    y='popularity',
    color='popularity',
    color_continuous_scale='Viridis',
    hover_data=['artists', 'track_name'],
    labels={'danceability': 'Danceability', 'popularity': 'Popularity'},
    title='Danceability vs Popularity',
)

fig.show()


# Actual vs Predicted Song Popularity

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Predictors and response for the regression.
features = ['danceability', 'energy', 'valence', 'loudness', 'tempo']
target = 'popularity'

# Keep only the rows where the response and every predictor are present.
df = df.dropna(subset=[target, *features])

X = df[features]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the scaled training data and predict the held-out rows.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report the error and the explained variance on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Scatter of held-out actual values (x) against the model's predictions (y).
fig = px.scatter(
    x=y_test,
    y=y_pred,
    title="Actual vs Predicted Popularity",
    labels={'x': 'Actual Popularity', 'y': 'Predicted Popularity'},
)
fig.show()


# Classification model (Random Forest) to categorize songs into different genres

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Predictors and class label for the genre classifier.
features = ['danceability', 'energy', 'valence', 'loudness', 'tempo']
target = 'track_genre'

# Keep only the rows where the label and every predictor are present.
df = df.dropna(subset=[target, *features])

X = df[features]
y = df[target]

# 80% of the rows train the model, 20% are held out; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling statistics come from the training rows only and are reused for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with 100 trees and a fixed seed, fitted on the scaled training data.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Predicted genre for every held-out track.
y_pred = model.predict(X_test_scaled)

# Overall accuracy followed by the per-genre classification report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Pair each held-out label with its prediction and plot one against the other.
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
fig = px.scatter(
    data_frame=df_results,
    x='Actual',
    y='Predicted',
    title="Actual vs Predicted Genres",
)
fig.show()


# Song recommendation system based on a track’s features

In [None]:
import faiss
import numpy as np

# Audio features that define similarity between songs.
features = ['danceability', 'energy', 'valence', 'loudness', 'tempo']

# Drop rows missing any of those features. After this, row i of the matrices
# below corresponds to df.iloc[i].
df = df.dropna(subset=features)
X = df[features].to_numpy()

# Standardize every feature, then cast because FAISS requires float32.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X).astype(np.float32)

# Flat L2 index over the d-dimensional scaled vectors, one entry per row of df.
d = X_scaled.shape[1]
index = faiss.IndexFlatL2(d)
index.add(X_scaled)

# Function to recommend songs based on a given song index
def recommend_song(song_idx, num_recommendations=5):
    """Return the songs closest to ``df.iloc[song_idx]`` in the FAISS index.

    Args:
        song_idx: Row position of the query song in ``df`` / ``X_scaled``.
            Negative positions count from the end.
        num_recommendations: Maximum number of songs to return.

    Returns:
        DataFrame holding the ``album_name``, ``artists`` and ``track_genre``
        columns of the recommended rows, nearest first. It has fewer than
        ``num_recommendations`` rows when the index does not contain enough
        other songs, and is empty when ``num_recommendations`` is below 1.

    Raises:
        IndexError: If ``song_idx`` is outside the indexed rows.
    """
    columns = ['album_name', 'artists', 'track_genre']

    n_songs = X_scaled.shape[0]
    if not -n_songs <= song_idx < n_songs:
        raise IndexError(f"song_idx {song_idx} is out of range for {n_songs} songs")
    if num_recommendations < 1:
        return df.iloc[[]][columns]

    # Normalise negative positions: the slice [-1:0] would otherwise be empty.
    song_idx %= n_songs

    # Request one extra neighbour so the query song itself can be discarded,
    # but never more entries than the index holds.
    k = min(num_recommendations + 1, index.ntotal)
    _, indices = index.search(X_scaled[song_idx:song_idx + 1], k)

    # Remove the query by position rather than assuming it is the first hit:
    # songs with identical feature vectors tie at distance 0 and may precede it.
    # Negative labels are FAISS padding for "no neighbour found".
    neighbours = [int(i) for i in indices[0] if i >= 0 and i != song_idx]
    return df.iloc[neighbours[:num_recommendations]][columns]

# Demo: up to ten recommendations for the track at row position 2.
recommended_songs = recommend_song(2, num_recommendations=10)
print("Recommended Songs:", recommended_songs, sep="\n")
