# Analysis by year (2014 - 2024)

In [2]:
import pandas as pd
import plotly.express as px

df_year = pd.read_csv(r'./Dataset/big_data2.csv', encoding='latin1')
df_year = df_year.drop(columns=['Unnamed: 0'])

# remove rows with unknown value
df_year = df_year[~df_year[['song', 'artist', 'album']].isin(['Unknown Song', 'Unknown Artist', 'Unknown Album']).any(axis=1)]

#convert miliseconds to minutes
df_year['duration_minutes'] = df_year['duration'] / 60000

# convert date column to date time format
df_year['date'] = pd.to_datetime(df_year['date'], dayfirst=True, errors='coerce')

#extract years from 'date' column
df_year['year'] = df_year['date'].dt.year

df_year.head()

Unnamed: 0,duration,country,song,artist,album,date,time,duration_minutes,year
0,29381,AU,Up and Away,Envy,The Magic Soup And The Bittersweet Faces,2014-08-26,11:22:33,0.489683,2014
1,13458,AU,Keep It Down Low,Envy,The Magic Soup And The Bittersweet Faces,2014-08-26,11:22:47,0.2243,2014
2,3436,AU,No Love In The Club,Envy,The Magic Soup And The Bittersweet Faces,2014-08-26,11:22:49,0.057267,2014
3,247520,AU,Am I Wrong,Nico & Vinz,Am I Wrong,2014-08-26,11:26:56,4.125333,2014
4,177352,AU,So Everything,Envy,The Magic Soup And The Bittersweet Faces,2014-08-26,11:29:54,2.955867,2014


In [3]:
df_year.columns

Index(['duration', 'country', 'song', 'artist', 'album', 'date', 'time',
       'duration_minutes', 'year'],
      dtype='object')

In [4]:
#group by years and calculate summary statistics
yearly_summary = df_year.groupby('year').agg(
    total_songs =('song', 'count'),
    total_minutes=('duration_minutes', 'sum'),
    avg_duration_per_song=('duration_minutes', 'mean')
).reset_index()

yearly_summary

Unnamed: 0,year,total_songs,total_minutes,avg_duration_per_song
0,2014,7026,22971.571683,3.269509
1,2015,9665,28320.23265,2.930184
2,2016,9025,28434.38085,3.150624
3,2017,9336,25305.800717,2.710561
4,2018,4316,10417.60345,2.413717
5,2019,2943,8757.580333,2.975732
6,2020,2774,7305.782533,2.633663
7,2021,1216,3092.199517,2.542927
8,2022,7492,14586.241167,1.946909
9,2023,16487,36117.10455,2.190641


In [5]:
# line plot
import plotly.express as px

# fig = px.line(
#     yearly_summary,
#     x='year',
#     y='total_songs',
#     title='Total Songs played by Year',
#     labels={'total_songs': 'Total Songs'}
# )

# fig.update_layout(
#     xaxis_title='year',
#     yaxis_title='Number of Songs',
#     xaxis=dict(
#         tickmode='linear', 
#         tick0=2014,
#         dtick=1,
#         range=[2014, 2024]
#     ))

# fig.show()

# bar chart
fig = px.bar(
    yearly_summary, 
    x='year',
    y='total_songs',
    title='Total Songs Played by Year',
    labels={'total_songs': 'Total Songs'}
)

fig.update_layout(
    xaxis_title='year',
    yaxis_title='Number of Songs',
    xaxis=dict(
        tickmode='linear', 
        tick0=2014,
        dtick=1,
        range=[2014, 2024]
))

fig.show()

In [6]:
# total duration per year
fig = px.line(
    yearly_summary,
    x='year', 
    y='total_minutes',
    title='Total Duration played by Year (minutes)',
    labels={'total_minutes': 'Total Duration (minutes)'}
)

fig.update_layout(
    xaxis_title='Year', yaxis_title='Total Duration (Minutes)',
    xaxis=dict(
        tickmode='linear', 
        tick0=2014,
        dtick=1,
        range=[2014, 2024]
))

fig.show()

In [7]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots: two rows, one column
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Total Songs Played by Year', 'Total Duration Played by Year (minutes)'),
    shared_xaxes=True
)

# Bar chart for total songs
bar_trace = go.Bar(
    x=yearly_summary['year'],
    y=yearly_summary['total_songs'],
    name='Total Songs',
    marker=dict(color='pink')
)

# Line chart for total duration
line_trace = go.Scatter(
    x=yearly_summary['year'],
    y=yearly_summary['total_minutes'],
    mode='lines',
    name='Total Duration (minutes)'
)

# Add traces to the subplots
fig.add_trace(bar_trace, row=1, col=1)
fig.add_trace(line_trace, row=2, col=1)

# Update layout for the combined figure
fig.update_layout(
    title_text='Yearly Summary of Songs and Duration',
    xaxis_title='Year',
    yaxis_title='Total Songs',
    xaxis2_title='Year',
    yaxis2_title='Total Duration (Minutes)',
    xaxis=dict(
        tickmode='linear', 
        tick0=2014,
        dtick=1,
        range=[2014, 2024],
        showgrid=True,
        gridcolor='gray'
    ),
    yaxis=dict(
        title='Number of Songs',
        showgrid=True,
        gridcolor='gray'
    ),
    yaxis2=dict(
        title='Total Duration (Minutes)',
        showgrid=True,
        gridcolor='gray'
    ),
    plot_bgcolor='rgb(30,30,30)',
    paper_bgcolor='rgb(30,30,30)',
    template='plotly_dark',
    showlegend=False
)

fig.write_html("./charts/section2-duration.html")

fig.show()


In [8]:
# top 15 artist per year
top_artist_per_year = df_year.groupby(['year', 'artist']).size().reset_index(name='song_count')
top_artist_per_year = top_artist_per_year.sort_values(['year', 'song_count'], ascending=[True, False])

top_artist_per_year = top_artist_per_year.groupby('year').head(15).reset_index(drop=True)

top_artist_per_year.head()

Unnamed: 0,year,artist,song_count
0,2014,Yuna,860
1,2014,Beyonc√©,673
2,2014,Ed Sheeran,553
3,2014,Norah Jones,274
4,2014,Destiny's Child,248


**Extract and Store the top15 artist for each year in csv file**

In [9]:
years = range(2014, 2025)

filtered_artist_DF = []

for year in years:
    filtered_artist = top_artist_per_year[top_artist_per_year['year'] == year]    
    filtered_artist_DF.append(filtered_artist)
    
combined_artist_year = pd.concat(filtered_artist_DF)

# combined_artist_year.to_csv('top_artist_2014_2024.csv', index=False)

In [10]:
# fig = px.bar(
#     top_artist_per_year, 
#     x='artist', y='song_count', color='year',
#     title = 'Top 15 artist per Year (2014 - 2024)', 
#     facet_col='year', 
#     labels={'song_count': 'Number of Songs Played'}
# )

# fig.update_layout(xaxis_title='Artist', yaxis_title='Number of Songs Played')
    
# fig.show()


In [11]:
# import dash 
# from dash import dcc, html
# import plotly.express as px
# import pandas as pd

# #create plotly figure
# def create_figure_artist(year):
#     filtered_df = top_artist_per_year[top_artist_per_year['year'] == year]
#     fig = px.bar(
#         filtered_df,
#         x='song_count', y='artist',
#         orientation='h',
#         title= f'Top Artist for {year}',
#         color='artist',
#         labels={'song_count': 'song count'}
#     )
    
#     fig.update_layout(
#         xaxis_title='Song Count', yaxis_title='Artist', showlegend=False,
#         title={
#             'text': f'Top Artist for {year}', 
#             'x': 0.5,
#             'xanchor':'center', 
#             'font': {
#                 'size': 30, 
#                 'weight': 'bold',
#                 'color': 'pink'
#             }},
#             plot_bgcolor='rgb(30, 30, 30)', 
#             paper_bgcolor='rgb(30, 30, 30)',
#             font={'color': 'white'},
#             xaxis=dict(
#                 tickfont={'color': 'white'},
#             ),
#             yaxis=dict(
#                 tickfont={'color': 'white'},
#             ),
#         )
    
#     return fig

# #initialize dash app
# app = dash.Dash(__name__)

# app.layout = html.Div([
#     #outer container that holds both te dropdown and the chart
#     html.Div([
#         html.Div([
#             html.Label('Select Year', style={'color': 'pink', 'fontSize': 18, 'textAlign': 'center'}),  # Label for dropdown
#             dcc.Dropdown(
#                 id='year-dropdown',
#                 options=[{'label': year, 'value': year} for year in top_artist_per_year['year'].unique()], 
#                 value=2014,
#                 style={'backgroundColor': 'white', 'color': 'black', 'width': '150px'},  
#                 clearable=False
#             ),
#         ], style={'display': 'flex', 'justifyContent': 'center', 'alignItems': 'center', 'flexDirection':'column', 'marginBottom': '20px'}),
        
#         #chart container
#         dcc.Graph(id='bar-chart')
#     ], style={'width': '100%', 'display': 'flex', 'flexDirection':'column', 'alignItems': 'center'})    
# ])

# @app.callback(
#     dash.Output('bar-chart', 'figure'),
#     [dash.Input('year-dropdown', 'value')]
# )

# def update_chart_artist (selected_year):
#     return create_figure_artist(selected_year)

# if __name__ == '__main__':
#     app.run_server(debug=True, port=8060)
    
# # default browser: http://127.0.0.1:8060

In [12]:
# top 15 songs per year
top_songs_per_year = df_year.groupby(['year', 'song', 'artist']).size().reset_index(name='play_count')
top_songs_per_year = top_songs_per_year.sort_values(['year', 'play_count'], ascending=[True, False])

top_songs_per_year = top_songs_per_year.groupby('year').head(15).reset_index(drop=True)

top_songs_per_year.head()

Unnamed: 0,year,song,artist,play_count
0,2014,Thinking out Loud,Ed Sheeran,473
1,2014,Come Away With Me,Norah Jones,242
2,2014,Happy Little Pill,Troye Sivan,242
3,2014,Superheroes,The Script,232
4,2014,How Long Will I Love You - Bonus Track,Ellie Goulding,189


**Extract and Store the top 15 songs for each year in csv file**

In [13]:
years = range(2014, 2025)

filtered_song_DF = []

for year in years:
    filtered_song = top_songs_per_year[top_songs_per_year['year'] == year]    
    filtered_song_DF.append(filtered_song)
    
combined_song_year = pd.concat(filtered_song_DF)

# combined_song_year.to_csv('top_song_2014_2024.csv', index=False)

In [14]:
import dash 
from dash import dcc, html
import plotly.express as px
import pandas as pd

#create plotly figure
def create_figure_song (year):
    filtered_df = top_songs_per_year[top_songs_per_year['year'] == year]
    fig = px.bar(
        filtered_df,
        x='play_count', y='song',
        orientation='h',
        color='song',
        title= f'Top Songs for {year}',
        labels={'song_count': 'Number of Times Songs Played'},
        hover_data={'artist': True, 'play_count': True},
        text='artist'
    )
    
    fig.update_layout(
        xaxis_title='Song count', yaxis_title='Song', showlegend=False,
        title={
            'text': f'Top Songs for {year}', 
            'x': 0.5,
            'xanchor':'center', 
            'font': {
                'size': 30, 
                'weight': 'bold'}
            })
    
    #update text font size
    fig.update_traces(textfont_color='white')
    
    return fig

#initialize dash app
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Dropdown(
        id='year-dropdown',
        options=[{'label': year, 'value': year} for year in top_songs_per_year['year'].unique()], 
        value=2014
    ),
    dcc.Graph(id='bar-chart')
])

@app.callback(
    dash.Output('bar-chart', 'figure'),
    [dash.Input('year-dropdown', 'value')]
)

def update_chart_song(selected_year):
    return create_figure_song(selected_year)

if __name__ == '__main__':
    app.run_server(debug=True, port=8051)

In [34]:
# summary statistics by country

country_summary = df_year.groupby('country').agg(
   total_songs=('song', 'count'),
   total_minutes=('duration_minutes', 'sum'),
   distinct_artists=('artist', 'nunique'),
   distinct_songs=('song', 'unique') 
).reset_index()

# create a mapping of ISO alpha-2 codes to country names
iso_to_country = {
    'MY': 'Malaysia',
    'AU': 'Australia',
    'ID': 'Indonesia',
    'TH': 'Thailand',
    'AE': 'UAE',
    'FR': 'France',
    'TR': 'Turkey',
    'DE': 'Germany',
    'IT': 'Italy',
    'CZ': 'Czechia',
    'HU': 'Hungary',
    'IN': 'India',
    'BG': 'Bulgaria',
    'RO': 'Romania',
    'SG': 'Singapore'
}

country_summary['country_name'] = country_summary['country'].map(iso_to_country)

country_summary


Unnamed: 0,country,total_songs,total_minutes,distinct_artists,distinct_songs,country_name
0,AE,609,1522.718533,144,"[Who Wants To Live Forever - Remastered 2011, ...",UAE
1,AU,37001,95630.450367,1747,"[Up and Away, Keep It Down Low, No Love In The...",Australia
2,BG,16,61.513867,9,"[A Kind Of Magic - Remastered 2011, Sial, Heav...",Bulgaria
3,CZ,86,288.384183,26,"[Sial, No Scrubs, Blue, Ke Hujung Dunia, Terus...",Czechia
4,DE,134,355.464833,63,"[Budapest, Sial, Night Changes, Unstoppable, A...",Germany
5,FR,589,1380.346717,135,"[La Vie En Rose, Take Me to Church, Sial, Ke H...",France
6,HU,52,179.775133,9,"[Lose You To Love Me, Budapest, La Vie En Rose...",Hungary
7,ID,669,2050.555633,163,"[I Say a Little Prayer, Heartless, Sumpah, Ter...",Indonesia
8,IN,47,121.568517,10,"[Bohemian Rhapsody - Remastered 2011, Fat Bott...",India
9,IT,87,137.3718,21,"[Influencer, Felpa champion, Bello sfigo, Ros√É...",Italy


In [35]:
#save df to a csv file
csv_file_path = './Dataset/country_summary.csv'
country_summary.to_csv(csv_file_path, index=False)

In [17]:
# countries = country_summary['country']
# countries

country_summary.columns

Index(['country', 'total_songs', 'total_minutes', 'distinct_artists',
       'distinct_songs', 'country_name'],
      dtype='object')

In [36]:
# How much I use spotify while traveling/living in a country

# aggregate total minutes by country
agg_df = country_summary.groupby('country_name')['total_minutes'].sum().reset_index()
agg_df_sorted = agg_df.sort_values(by='total_minutes', ascending=False)

agg_df_sorted

Unnamed: 0,country_name,total_minutes
9,Malaysia,117491.706167
0,Australia,95630.450367
7,Indonesia,2050.555633
11,Thailand,1860.201267
13,UAE,1522.718533
3,France,1380.346717
12,Turkey,393.4938
4,Germany,355.464833
2,Czechia,288.384183
5,Hungary,179.775133


In [19]:
# spotify usage - bar

# fig = px.bar(
#     agg_df_sorted, x='total_minutes', y='country_name', orientation='h',
#     title='Total Spotify Usage by Country',
#     labels={'total_minutes': 'Total Minutes', 'country_name': 'Country'})

# fig.show()

In [40]:
# spotiy usage - bubbles 

fig = px.scatter(
    country_summary, x='country_name', y='total_minutes', 
    size='total_songs', color='total_songs', size_max=100,
    hover_name='country_name', title='Total Spotify Usage based on Location (Country)',
    labels= {
        'total_minutes': 'Total Minutes',
        'country_name': 'Country',
        'total_songs': 'Total Songs'
    }
)

fig.update_layout(
    title={
        'text': f'Total Spotify Usage based on Location', 
        'x': 0.5,
        'xanchor':'center', 
        'font': {
            'size': 25, 
            'weight': 'bold'}
    },
    plot_bgcolor='rgb(30, 30, 30)',  # Dark background
    paper_bgcolor='rgb(30, 30, 30)',  # Dark paper background
    font=dict(color='white'),  # White font color
    height=450,  # Adjust height
    width=550,
    xaxis=dict(
        showgrid=True,
        gridcolor='rgb(169, 169, 169)',
        tickangle=45
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='rgb(169, 169, 169)'
    )
)

fig.show()

In [21]:
print(country_summary.columns)

#check the column contain single values (strings & integers) and not array
print(country_summary.dtypes)

# print(country_summary[['distinct_songs', 'total_songs']].head())

Index(['country', 'total_songs', 'total_minutes', 'distinct_artists',
       'distinct_songs', 'country_name'],
      dtype='object')
country              object
total_songs           int64
total_minutes       float64
distinct_artists      int64
distinct_songs       object
country_name         object
dtype: object


# Error

- I was trying to create a dashboard where we can select a countyr from a dropdown to see top 15 songs played while I was in that particular country
- I was using country_summary data frames - it has total distinct songs played and duration of spotify consumption BUT it does not contain the list of songs played in eahc country, i cannot use this dataframe to create dashboard as I wanted to see individual song 

**Solution**
- idk if it's going to work but I'm going to create another df/csv file that has proper spelling of the country
- and take it from there
- **update**: It works! ü•≥ü•≥ü•≥



In [22]:
df_year['country_name'] = df_year['country'].map(iso_to_country)

df_year.head(1)

Unnamed: 0,duration,country,song,artist,album,date,time,duration_minutes,year,country_name
0,29381,AU,Up and Away,Envy,The Magic Soup And The Bittersweet Faces,2014-08-26,11:22:33,0.489683,2014,Australia


In [23]:

# #save df to a csv file 
# csv_path = './Dataset/country_year_name.csv'
# df_year.to_csv(csv_path)

df_year.columns

Index(['duration', 'country', 'song', 'artist', 'album', 'date', 'time',
       'duration_minutes', 'year', 'country_name'],
      dtype='object')

In [42]:
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
import pandas as pd

#initialize the dash app
app = dash.Dash(__name__)

#define the app layout 
app.layout = html. Div([
    html.H1("Top 15 songs I listened to by Location"),
    
    dcc.Dropdown(
        id='country-dropdown',
        options= [{'label': country, 'value': country} for country in df_year['country_name'].unique()],
        value=df_year['country_name'].unique()[0]
    ),
    
    dcc.Graph(id='top-songs-graph')
])

#define callback to update graph based on the selected county 
@app.callback(
    Output('top-songs-graph', 'figure'),
    Input('country-dropdown', 'value')
)

def update_graph_ctry(selected_country2):
    filtered_ctry = df_year[df_year['country_name'] == selected_country2]
    
    #get top 15 songs with artists name
    # top_songs = filtered_ctry['song'].value_counts().head(15)
    top_songs = filtered_ctry.groupby(['song', 'artist']).size().reset_index(name='count')
    top_songs = top_songs.sort_values(by='count', ascending=False).head(15)
    
    #create horizontal bar chart
    fig = px.bar(
        top_songs, 
        y='song',
        x='count',
        color='song',
        labels={'song': 'Song', 'count': 'Frequency'},
        title=f'Top 15 songs in {selected_country2}',
        text='artist',
        hover_data={'song': True, 'count': True, 'artist': True},
        orientation = 'h'
    )
    
    fig.update_layout(
        showlegend=False,
        title={
            'text': f'Top 15 songs in {selected_country2}', 
            'x': 0.5,
            'xanchor':'center', 
            'font': {
                'size': 30, 
                'weight': 'bold'}
        },
        width=600
    ), 
    
    
    #update text font size
    fig.update_traces(textfont_color='white')
    
    return fig

#run the app
if __name__ == '__main__':
    app.run_server(debug=True, port=8059)

**race bar chart for top 15 artist each year**

- it looks suck to be honest

In [25]:
import pandas as pd
import plotly.express as px

race_chart = pd.read_csv(r'./Dataset/top_artist_2014_2024.csv')

race_chart.head()

Unnamed: 0,year,artist,song_count
0,2014,Yuna,860
1,2014,Beyonc√©,673
2,2014,Ed Sheeran,553
3,2014,Norah Jones,274
4,2014,Destiny's Child,248


In [26]:
fig = px.bar(
    race_chart,
    x='song_count',
    y='artist',
    color='artist',
    animation_frame='year',
    range_x=[0, race_chart['song_count'].max()],
    title='Top 15 Artists by Year',
    labels={'song_count': 'Song Count'},
    orientation='h',
    height=700,
    template='plotly_dark'
)

fig.show()