In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
from sklearn import datasets, linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

In [None]:
# Read the two source CSV files into DataFrames:
#   full_music_data  <- full_music_data.csv (used below for eras, loudness and popularity)
#   spotify_music_df <- music_genre.csv (used below for genres, correlations, histograms and PCA)
full_music_data = pd.read_csv("full_music_data.csv")
spotify_music_df = pd.read_csv("music_genre.csv")


In [None]:
# Preview the first 10 rows to sanity-check the columns and values that were loaded
spotify_music_df.head(10)


In [None]:
# Summarise the frame (column names, dtypes, non-null counts) to spot missing data early
spotify_music_df.info()


# Distribution of Genres

In [None]:
# Count the songs in each genre and draw the counts as a bar chart
fig = px.bar(spotify_music_df.groupby('genre').size())
fig.update_layout(
    title={
        'text': "Distribution of Each Song's Genre",
        'x': 0.5,  # together with xanchor, centres the title horizontally
        'xanchor': 'center',
        'font': {'size': 20},
    },
    yaxis_title="Count",
    showlegend=False,
)

## Insights drawn from the genre distribution

As seen from the bar chart above, Pop/Rock is overwhelmingly the most dominant genre in the dataset.


# Analyzing Correlation of audio features

In [None]:
# Correlation heatmap of the audio features: the identifier, text, date, genre
# and duration columns are dropped before the pairwise correlations are computed.
colormap = plt.cm.RdBu
plt.figure(figsize=(16, 14))
sns.heatmap(
    spotify_music_df.drop(
        columns=['song_id', 'artist_names', 'artists_id', 'release_date',
                 'song_title', 'genre', 'duration_ms']
    ).corr(),
    linewidths=0.1,
    cmap=colormap,
    vmax=1.0,
    square=True,
    annot=True,  # print the coefficient inside every cell
    center=0,    # centre the diverging colormap on zero correlation
)


## Insights drawn from correlation matrix

As seen from the correlation matrix above, popularity is strongly correlated with the year released. This understanding aligns with expected outcomes as Spotify generates its 'popularity' value not just by the number of streams or views a song receives, but also by how recent they are 

Acousticness, as expected, has a strong negative correlation with energy: high-energy songs are generally associated with electronic music, so the more acoustic/manual elements a song has, the lower its energy level tends to be.

Similarly, energy has a strong positive correlation with loudness. This is expected, as high-energy songs tend to be loud.

# Defining Music Eras 

In [None]:
def year(df):
    """Map a single release year to the name of its music era.

    Despite the parameter name, ``df`` is one scalar year (the function is
    meant to be applied element-wise to a year column), not a DataFrame.

    Returns:
        "Post War"    for 1921-1945
        "Retro"       for 1946-1970
        "Millenial"   for 1971-1995
        "Post-Modern" for anything after 1995
        "Pre-1921"    for 1920 or earlier
        "Unknown"     for a missing (NaN) year

    The last two labels exist so that early or missing years are no longer
    silently counted as "Post-Modern" by the final fallback.
    """
    # NaN is the only value that is not equal to itself
    if df != df:
        return "Unknown"
    # Upper bounds are checked in ascending order, so each test only needs the
    # upper edge of its band.
    if df <= 1920:
        return "Pre-1921"
    if df <= 1945:
        return "Post War"
    if df <= 1970:
        return "Retro"
    if df <= 1995:
        return "Millenial"
    return "Post-Modern"
# Label every track with its era, then chart how the tracks split across eras
full_music_data['era'] = full_music_data['year'].apply(year)
px.pie(
    full_music_data,
    names='era',
    title='Music Eras',
    hole=0.2,
)


## Insights drawn from Music Eras

As seen from the distribution of music across the different eras, close to half of all the music in the available data falls in the 'Millenial' era (1971 to 1995). It is also interesting to note that songs from the 'Post War' era (1921 to 1945) make up just ~3% of the total. This may be explained by the mood and priorities coming out of the World War, with people likely prioritizing things other than music (as they probably should).


# Classifying degree of loudness of songs

In [None]:
def loud(row, median=None, stdev=None):
    """Bucket one track's loudness relative to the dataset's median loudness.

    Parameters:
        row:    a mapping/Series exposing a 'loudness' value.
        median: median loudness of the dataset. When omitted it is computed
                from ``full_music_data['loudness']``.
        stdev:  standard deviation of loudness. When omitted it is computed
                from ``full_music_data['loudness']``.

    Returns one of "Extreme", "Very Loud", "Loud", "Soft", "Very Soft" or
    "Mellow". A label is chosen by the highest threshold
    ``median + k * stdev`` (k = 1.5, 1, 0.5, -0.5, -1) that the loudness
    reaches; anything below ``median - stdev`` is "Mellow". A NaN loudness
    fails every comparison and therefore also ends up as "Mellow".

    Pass ``median`` and ``stdev`` when calling this once per row: computing
    them inside the function rescans the whole column for every row.
    """
    # Fall back to the module-level frame so existing `loud(row)` calls keep working
    if median is None:
        median = full_music_data['loudness'].median()
    if stdev is None:
        stdev = full_music_data['loudness'].std()

    value = row['loudness']
    # Thresholds in descending order: the first one reached decides the label
    bands = (
        (1.5, "Extreme"),
        (1.0, "Very Loud"),
        (0.5, "Loud"),
        (-0.5, "Soft"),
        (-1.0, "Very Soft"),
    )
    for factor, label in bands:
        if value >= median + factor * stdev:
            return label
    return "Mellow"


# Compute the dataset statistics once, instead of once per row inside `loud`
loudness_median = full_music_data['loudness'].median()
loudness_stdev = full_music_data['loudness'].std()
full_music_data['is_loud'] = full_music_data.apply(
    lambda row: loud(row, loudness_median, loudness_stdev), axis=1
)
px.pie(data_frame=full_music_data, names='is_loud', hole=0.2, title='DEGREE of LOUDNESS')


## Insights drawn from Degree of Loudness

Based on the pie chart above, nearly 40% of the songs are 'Soft' and less than 1% of the songs are 'Extreme'.

# Classifying Popularity of songs

In [None]:
def func(df):
    """Translate one popularity score into a coarse popularity label.

    ``df`` is a single score (not a DataFrame). Scores above 75 are
    'Very Popular', above 50 'Popular', above 25
    'Neither Popular nor Unpopular', and everything else 'Unpopular'.
    """
    # Lower bounds are tried from the top down; the first one the score exceeds wins
    for lower_bound, label in (
        (75, 'Very Popular'),
        (50, 'Popular'),
        (25, 'Neither Popular nor Unpopular'),
    ):
        if df > lower_bound:
            return label
    return 'Unpopular'
# Label every track with its popularity band, then chart the share of each band
full_music_data['isPopular'] = full_music_data['popularity'].apply(func)
px.pie(
    full_music_data,
    names='isPopular',
    title='POPULARITY',
    hole=0.2,
)


## Insights drawn from Popularity

Based on the pie chart above, about half of the songs are neither popular nor unpopular, about a fifth are classified as popular, and nearly 28% have a low popularity score.


In [None]:
# Two side-by-side panels: a 1D histogram of energy (left) and a 2D histogram
# (heatmap) of danceability against valence (right)
x = "danceability"
y = "valence"

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(16, 10))
fig.suptitle("Histograms")

# Left panel: distribution of energy
ax1.hist(spotify_music_df["energy"])
ax1.set_xlabel("energy")

# Right panel: joint distribution of the two features named by x and y
h = ax2.hist2d(spotify_music_df[x], spotify_music_df[y], bins=20)
ax2.set_xlabel(x)
ax2.set_ylabel(y)

# hist2d returns (counts, xedges, yedges, image); the image drives the colorbar
plt.colorbar(h[3], ax=ax2)

plt.show()


### The Spotify Web API developer guide defines danceability and valence as follows:

Danceability: Describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. 

Valence: Describes the musical positiveness conveyed by a track

The histogram on the left illustrates the distribution of songs by energy level. The distribution is roughly normal but slightly skewed to the right, with the largest number of songs having an energy value between 0.4 and 0.6.

The plot on the right is a heatmap (2D histogram) showing how the songs are distributed across danceability and valence values. The largest number of songs have a valence and danceability value of ~0.6.

## Principal Component Analysis

In [None]:
# Project the scaled audio features onto three principal components and show
# every song as a point in an interactive 3D scatter plot.
selected_attributes = ["energy", "liveness", "tempo", "valence", "loudness", "speechiness", "acousticness", "danceability", "instrumentalness"]

# Hover text for each point: "<artist names> - <song title>"
display_text_1 = spotify_music_df["artist_names"] + " - " + spotify_music_df["song_title"]
display_text_2 = display_text_1.values

X = spotify_music_df[selected_attributes].values
y = spotify_music_df["danceability"].values  # only used to colour the markers

# Rescale every feature to MinMaxScaler's default [0, 1] range before PCA so
# that no feature dominates purely because of its units
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(X)

# Fit the PCA and project the scaled features in one step
pca = PCA(n_components=3)
X = pca.fit_transform(X)

# `py` (plotly.offline) and `go` (plotly.graph_objects) come from the
# notebook's import cell, so they are not re-imported here.
import plotly.tools as tls  # not used in this cell; kept in case later cells rely on `tls`
py.init_notebook_mode(connected=True)

trace = go.Scatter3d(
    x=X[:, 0],
    y=X[:, 1],
    z=X[:, 2],
    text=display_text_2,
    mode="markers",
    marker=dict(
        size=8,
        color=y,
        showscale=True,  # show the colour scale so the colour encoding is readable
        colorbar=dict(title=dict(text="danceability")),
    ),
)

fig = go.Figure(data=[trace])
fig.update_layout(
    title="Songs projected onto the first three principal components",
    scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"),
)
py.iplot(fig, filename="PCA-plot")


### Insights drawn from the PCA

Based on the 3D plot above, the smaller the distance between any two songs, the more similar their traits.