In [1]:
# Import Statements
import pandas as pd
import dash
from dash import dcc, html, Input, Output
import plotly.express as px

In [2]:
# Read the raw song table. The path is relative, so it is resolved against the
# current working directory of the kernel.
dataframe_1 = pd.read_csv(filepath_or_buffer = 'songs_normalize.csv')

In [3]:
# Keep the first occurrence of every fully identical row and drop the repeats
# (59 rows according to the original author's note). dataframe_1 itself is not modified.
dataframe_filtered = dataframe_1.drop_duplicates(keep = 'first')

In [4]:
# Show the dtype pandas inferred for every column of the raw (pre-deduplication) frame.
print(dataframe_1.dtypes) # No issue with data types

artist               object
song                 object
duration_ms           int64
explicit               bool
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object


In [5]:
# Count the distinct values in each column after the duplicate rows were dropped.
dataframe_filtered.nunique() # Unique values

artist               835
song                1879
duration_ms         1793
explicit               2
year                  23
popularity            76
danceability         565
energy               580
key                   12
loudness            1671
mode                   2
speechiness          837
acousticness        1208
instrumentalness     772
liveness             783
valence              760
tempo               1831
genre                 59
dtype: int64

In [6]:
# 'genre' is the raw genre string (it can list several genres separated by commas),
# so every distinct combination counts as its own value here.
print(dataframe_filtered['genre'].nunique())  # There are 59 unique genres

59


In [7]:
# Work on an independent copy so the column assignments below act on a frame
# this notebook owns rather than on the result of drop_duplicates().
dataframe_filtered = dataframe_filtered.copy()

# Rows whose genre is the literal text 'set()' are relabelled 'Unknown'
# (presumably an empty-set artefact of how the dataset was exported - confirm).
dataframe_filtered['genre'] = dataframe_filtered['genre'].replace('set()', 'Unknown')

# Primary genre = the text before the first comma, with surrounding whitespace removed.
# The vectorised .str accessor gives the same result as a per-row split()/strip(),
# but passes a missing genre through as NaN instead of raising AttributeError.
dataframe_filtered['new_genre'] = dataframe_filtered['genre'].str.split(',', n = 1).str[0].str.strip()

In [8]:
# Show the raw genre string next to the derived primary genre to sanity-check the split.
print(dataframe_filtered[['genre', 'new_genre']])

                 genre new_genre
0                  pop       pop
1            rock, pop      rock
2         pop, country       pop
3          rock, metal      rock
4                  pop       pop
...                ...       ...
1995               pop       pop
1996               pop       pop
1997  hip hop, country   hip hop
1998               pop       pop
1999           hip hop   hip hop

[1941 rows x 2 columns]


In [9]:
# Number of distinct primary genres, followed by how many songs fall under each one.
print(dataframe_filtered['new_genre'].nunique())
print(dataframe_filtered['new_genre'].value_counts())

12
new_genre
pop                  912
hip hop              749
rock                 155
Dance/Electronic      41
Unknown               22
latin                 15
R&B                   13
World/Traditional     10
country               10
metal                  9
Folk/Acoustic          4
easy listening         1
Name: count, dtype: int64


In [10]:
# Substrings searched for in song titles (the matching code lower-cases both sides,
# so the capitalisation used here does not matter).

# A title containing any of these is flagged as an edited / alternate version.
edited_keywords = [
    "Radio",
    "Version",
    "Remix",
    "Mix",
    "Remaster",
    "(Personal)",
    "(Interlude)",
    "Explicit",
    "Video",
    "Edit",
    "*",
    "from",
    "Official",
]

# A title containing any of these is flagged as having a featured artist.
# Note the leading space in " X".
feature_keywords = [
    "feat",
    "with",
    "Featuring",
    "Feat",
    "&",
    "Vs",
    " X",
]

In [11]:
def _title_has_keyword(title, keywords):
    """Return True when any keyword occurs in the title, ignoring case.

    The title is passed through str() first, so non-string values are matched
    against their text form.
    """
    lowered_title = str(title).lower()
    for keyword in keywords:
        if keyword.lower() in lowered_title:
            return True
    return False


# One boolean flag column per keyword list, derived from the song title.
dataframe_filtered['is_edited'] = dataframe_filtered['song'].apply(_title_has_keyword, args = (edited_keywords,))
dataframe_filtered['is_featured'] = dataframe_filtered['song'].apply(_title_has_keyword, args = (feature_keywords,))

In [12]:
# Normalise 'explicit' to real booleans: the column is rendered as upper-case text
# and looked up in the table below. Any value not listed in the table becomes NaN.
explicit_lookup = {'TRUE': True, '1': True, 'FALSE': False, '0': False}
explicit_as_text = dataframe_filtered['explicit'].astype(str).str.upper()
dataframe_filtered.loc[:, 'explicit'] = explicit_as_text.map(explicit_lookup)

In [13]:
# A popularity of zero is treated by the author as a missing data point,
# so only rows with a non-zero score are kept.
has_popularity = dataframe_filtered['popularity'].ne(0)
dataframe_filtered = dataframe_filtered.loc[has_popularity]

In [14]:
# Row count after the zero-popularity songs were removed.
len(dataframe_filtered) # There are 1815 rows of remaining data

1815

In [15]:
# Summing a boolean flag column counts its True entries.
count_featured = dataframe_filtered['is_featured'].sum()
print(f"Number of featured songs: {count_featured}")

# Same count for the edited flag; the label previously repeated "featured" by mistake.
count_edited = dataframe_filtered['is_edited'].sum()
print(f"Number of edited songs: {count_edited}")

Number of featured songs: 337
Number of featured songs: 165


#### Initialize Application

In [16]:
# Create the Dash application object; the title is handed to the constructor
# instead of being assigned afterwards.
app = dash.Dash(__name__, title = "Application")

#### App Layout

In [17]:
# Page layout: a heading, one padded panel holding the four filter controls,
# then the eight graphs. The component ids are what the callbacks refer to.
app.layout = html.Div([
    html.H1("Spotify Analysis of Top Songs from 1998 to 2020"), # Define the heading

    html.Div([
        # Year range filter with one labelled mark per year
        html.Label("Years to Analyze :"),
        dcc.RangeSlider(
            id = 'input_years',
            min = 1998,
            max = 2020,
            step = 1,
            value = [1998, 2020],
            marks = {year: str(year) for year in range(1998, 2021, 1)},
        ),

        # Genre filter: options are the distinct primary genres found in the data.
        # Because multi = True the value is a list, so the default is given as a list too.
        html.Label("Select a Specific Genre to Analyze : "),
        dcc.Dropdown(
            id = 'input_genre',
            options = [{'label': genre_1, 'value': genre_1} for genre_1 in sorted(dataframe_filtered['new_genre'].dropna().unique())],
            multi = True,
            value = ['hip hop'],
        ),

        # Minimum popularity filter
        html.Label("Choose Popularity Level [From 1 to 89] NOTHING ELSE. Songs with a popularity of Greater than or equal to this will be shown : "),
        dcc.Input(id = 'input_popularity', type = 'number', value = 10, min = 1, max = 89),

        # Explicit-content filter: all songs, only explicit, or only non-explicit
        html.Div("Choose Explicit on or Off"),
        dcc.RadioItems(
            id = 'input_explicit',
            options = [
                {'label': 'All Songs', 'value': 'all'},
                {'label': 'Profane Songs', 'value': 'explicit_yes'},
                {'label': 'Non Profane Songs', 'value': 'explicit_none'},
            ],
            value = 'all',
            inline = True,
        ),
    ], style = {'padding': 10}),

    # One graph per figure produced by the callbacks
    dcc.Graph(id = 'genre_pie_chart_id'),
    dcc.Graph(id = 'song_per_year_id'),
    dcc.Graph(id = 'popularity_histogram_id'),
    dcc.Graph(id = 'popularity_heatmap_id'),
    dcc.Graph(id = 'edited_vs_regular_id'),
    dcc.Graph(id = 'featured_vs_nonfeatured_id'),
    dcc.Graph(id = 'explicit_trend_id'),
    dcc.Graph(id = 'hover_data_id'),
])

In [18]:
# Module-level hand-off between the two callbacks: the filter callback stores its
# filtered frame here and the hover callback reads it back.
# It starts as an empty frame with the same columns as the data (instead of the
# integer 0) so that a read which happens before the first write still finds a
# DataFrame and does not fail with a TypeError.
global_variable_1 = dataframe_filtered.iloc[0:0].copy()

#### Function to Update Graphs with callback

In [19]:
@app.callback(
    [
        Output('genre_pie_chart_id', 'figure'),
        Output('song_per_year_id', 'figure'),
        Output('popularity_histogram_id', 'figure'),
        Output('popularity_heatmap_id', 'figure'),
        Output('edited_vs_regular_id', 'figure'),
        Output('featured_vs_nonfeatured_id', 'figure'),
        Output('explicit_trend_id', 'figure'),
    ],
    [
        Input('input_years', 'value'),
        Input('input_genre', 'value'),
        Input('input_popularity', 'value'),
        Input('input_explicit', 'value'),
    ],
) # Link output graphs to input filters
def my_function(year_input_1, genre_input, popularity_input, explicit_input):
    """Apply the four filters to the song table and rebuild the seven figures.

    Parameters
    ----------
    year_input_1 : sequence of two numbers
        First and last year to keep; both ends are inclusive.
    genre_input : list of str, str, or empty/None
        Primary genre(s) to keep. An empty or missing selection keeps every genre.
    popularity_input : number or None
        Minimum popularity to keep. None (for example a cleared number box)
        means no popularity filter is applied.
    explicit_input : str
        'explicit_yes' keeps explicit songs, 'explicit_none' keeps non-explicit
        songs, anything else keeps rows whose flag is either True or False.

    Returns
    -------
    tuple
        Seven Plotly figures in the order of the Output list above: genre pie,
        songs per year, popularity histogram, popularity heat map, edited box
        plot, featured box plot, explicit trend line.

    Side effects
    ------------
    Stores the filtered frame (with an added 'Explicit' label column) in the
    module-level ``global_variable_1`` for the hover callback to read.
    """
    global global_variable_1

    # --- Filtering -------------------------------------------------------
    first_year, last_year = year_input_1[0], year_input_1[1]
    songs = dataframe_filtered[(dataframe_filtered['year'] >= first_year) & (dataframe_filtered['year'] <= last_year)]

    if genre_input:
        # The dropdown offers primary genres (the 'new_genre' column), so the
        # selection has to be matched against that column. Comparing with the raw
        # 'genre' string would drop every song that lists more than one genre.
        selected_genres = [genre_input] if isinstance(genre_input, str) else list(genre_input)
        songs = songs[songs['new_genre'].isin(selected_genres)]

    if popularity_input is not None:
        # Comparing the column with None would raise, so the filter is skipped instead.
        songs = songs[songs['popularity'] >= popularity_input]

    if explicit_input == 'explicit_yes': # Only explicit songs
        songs = songs[songs['explicit'] == True]
    elif explicit_input == 'explicit_none': # Only non explicit songs
        songs = songs[songs['explicit'] == False]
    else: # No preference: keep rows whose flag is a real True/False value
        songs = songs[songs['explicit'].isin([True, False])]

    total_songs = len(songs)

    # --- Genre distribution ----------------------------------------------
    # Counted on the raw genre string, so genre combinations appear as their own slices.
    genre_counts_1 = songs['genre'].value_counts().reset_index()
    genre_counts_1.columns = ['genre', 'count']
    figure_pie_chart = px.pie(genre_counts_1, names = 'genre', values = 'count', title = "Plotly Pie chart representing the Distribution of Song Genres in Selected Years")

    # --- Songs per year --------------------------------------------------
    songs_per_year = songs.groupby('year').size().reset_index(name = 'count')
    figure_songs_per_year = px.bar(songs_per_year, x = 'year', y = 'count', title = "Plotly bar chart of Number of Songs Released Per Year")

    # --- Popularity distribution (Plotly chooses the bins) ---------------
    figure_histogram = px.histogram(songs, x = 'popularity', title = "Plotly Histogram plot of Popularity Distribution of Songs")

    # --- Popularity by year heat map -------------------------------------
    figure_heatmap = px.density_heatmap(songs, x = 'year', y = 'popularity', z = 'popularity',  title = "Plotly Heat map of Popularity of Songs accross years", color_continuous_scale = "Blues")

    # --- Edited vs. non-edited box plot; the counts are shown in the axis label
    edited_count = songs['is_edited'].sum()
    edited_label = f"Number of songs that have a Remix or Another Version : {edited_count}"
    edited_total_label = f"Total Length of dataframe under these conditions : {total_songs}"
    figure_edited_not_edited = px.box(data_frame = songs, x = 'is_edited', y = 'popularity', labels = {'is_edited': f'{edited_label} \n {edited_total_label}'}, title = 'Plotly Box Plot of Popularity of Edited Songs vs Non Edited Songs Eg - "Remix", "Mix", "Remaster"')

    # --- Featured vs. non-featured box plot ------------------------------
    featured_count = songs['is_featured'].sum()
    featured_label = f"Number of songs that have a Featuring Artist : {featured_count}"
    featured_total_label = f"Total Length of dataframe under these conditions: {total_songs}"
    figure_featured_non_feautured = px.box(data_frame = songs, x = 'is_featured', y = 'popularity', labels = {'is_featured': f'{featured_label} \n {featured_total_label}'}, title = 'Plotly Box Plot of Popularity of Songs with a Feautured Artist Eg - "feat", "with", "Featuring", "&"')

    # --- Mean popularity per year, explicit vs. non-explicit -------------
    explicit_count = songs['explicit'].sum()
    explicit_text = f"NUMBER OF PROFANE SONGS : {explicit_count}"
    non_explicit_text = f"NUMBER OF NON PROFANE SONGS : {total_songs - explicit_count}"

    songs_labelled = songs.copy()
    songs_labelled['Explicit'] = songs_labelled['explicit'].map({True: 'Explicit', False: 'Non-Explicit'}) # Readable legend labels
    yearly_mean_popularity = songs_labelled.groupby(['year', 'Explicit'])['popularity'].mean().reset_index()
    figure_explcit_non_explicit = px.line(yearly_mean_popularity, x = 'year', y = 'popularity', color = 'Explicit', markers = True, title = f'Plotly Scatter Plot of Profane vs Non-Profance Songs Popularity over the years {explicit_text}, {non_explicit_text}')

    # Hand the filtered frame to the hover callback
    global_variable_1 = songs_labelled

    return figure_pie_chart, figure_songs_per_year, figure_histogram, figure_heatmap, figure_edited_not_edited, figure_featured_non_feautured, figure_explcit_non_explicit # Return the figures


#### Function with a hover definition

In [20]:
@app.callback(Output("hover_data_id", "figure"), Input("explicit_trend_id", "hoverData"))
def update_hover_plot(hover_data_input):
    """Draw the detail chart for the year hovered on the explicit-trend graph.

    Parameters
    ----------
    hover_data_input : dict or None
        The hoverData of the trend graph. The x value of its first entry under
        'points' is taken as the year.

    Returns
    -------
    A bar chart of duration_ms against popularity, coloured by the explicit flag,
    for the songs of the hovered year taken from ``global_variable_1``. An empty
    placeholder figure is returned when there is no usable hover point or when
    the shared frame is not available.
    """
    placeholder_title = "Hover over a data point to see details in the above graph Profane vs. Non Profane Songs Popularity"

    # Covers None, a payload without 'points', and an empty 'points' list
    hovered_points = hover_data_input.get("points") if hover_data_input else None
    if not hovered_points:
        return px.scatter(title = placeholder_title)

    # The shared frame is written by the filter callback; if it has not been
    # replaced by a DataFrame yet there is nothing to look up.
    shared_frame = global_variable_1
    if not isinstance(shared_frame, pd.DataFrame):
        return px.scatter(title = placeholder_title)

    year_data = hovered_points[0]['x']  # Year under the cursor
    songs_in_year = shared_frame[shared_frame['year'] == year_data] # Songs of that year

    figure_bar_chart_hover_1 = px.bar(data_frame = songs_in_year, x = 'duration_ms', y = 'popularity', color = 'explicit', title = f"Plotly Bar Chart of Duration (in milliseconds) of a single song vs. Popularity over the years color coded by Profanity in the year : {year_data}") # Creates a bar chart

    return figure_bar_chart_hover_1

#### Run the dash application on a specific port

In [21]:
# Start the Dash server only when this code is executed as the main program,
# not when it is imported from another module.
if __name__ == '__main__': # Run when run as the main program
    app.run(debug = True, port = 8054)  # Debug mode on; serve on port 8054