# Exploratory Data Analysis

- Distribution of song duration
    - Rows for "Unknown" songs were skewing the data: their durations ran past 200 minutes, which doesn't make sense for a single song.
    - I removed those rows from the data altogether.

- Top artists, albums, and songs
    - I shortened this song name: 'I Don’t Wanna Live Forever (Fifty Shades Darker) - From "Fifty Shades Darker (Original Motion Picture Soundtrack)"'
    - I shortened this album name: 'I AM...SASHA FIERCE - Platinum Edition'

- Song listening trend by country

- Listening trends over time (line plot) - **TODO:** this plot looks wrong and still needs investigating

- Listening activity heatmap - **TODO:** this plot looks wrong too

- Song density by country

In [2]:
import pandas as pd
import plotly.express as px

# Milliseconds in one minute; used to convert the raw `duration` column.
MS_PER_MINUTE = 60000

# Load the listening history.
# encoding='latin1' was added after BEYONCÉ's spelling was edited in the CSV:
# without it the "É" character was not recognized when reading the file.
# 'Unnamed: 0' is dropped right away (presumably a saved index column from an
# earlier export - TODO confirm).
big_data2 = (
    pd.read_csv(r'./Dataset/big_data2.csv', encoding='latin1')
    .drop(columns=['Unnamed: 0'])
)

# Convert milliseconds to minutes.
big_data2['duration_minutes'] = big_data2['duration'] / MS_PER_MINUTE

big_data2.head()

Unnamed: 0,duration,country,song,artist,album,date,time,duration_minutes
0,29381,AU,Up and Away,Envy,The Magic Soup And The Bittersweet Faces,26/8/2014,11:22:33,0.489683
1,13458,AU,Keep It Down Low,Envy,The Magic Soup And The Bittersweet Faces,26/8/2014,11:22:47,0.2243
2,3436,AU,No Love In The Club,Envy,The Magic Soup And The Bittersweet Faces,26/8/2014,11:22:49,0.057267
3,247520,AU,Am I Wrong,Nico & Vinz,Am I Wrong,26/8/2014,11:26:56,4.125333
4,177352,AU,So Everything,Envy,The Magic Soup And The Bittersweet Faces,26/8/2014,11:29:54,2.955867


In [3]:
# Quick look at the available columns.
print(big_data2.columns)

# NOTE(review): these totals are computed before the cell that filters out the
# 'Unknown Song' / 'Unknown Artist' / 'Unknown Album' rows, so they include them.

# Total listening time, expressed in minutes, hours and days.
total_mins = big_data2['duration_minutes'].sum()
total_hours = total_mins / 60
total_days = total_hours / 24

# Number of distinct artist names and distinct song titles.
unique_artists = big_data2['artist'].nunique()
distinct_songs = big_data2['song'].nunique()

# Report every figure as "<label>: <value>", one per line.
summary = {
    'Total minutes played': total_mins,
    'Total hours played': total_hours,
    'Total days played': total_days,
    'Number of unique artists': unique_artists,
    'Number of unique songs': distinct_songs,
}
for stat_label, stat_value in summary.items():
    print(f'{stat_label}: {stat_value}')

Index(['duration', 'country', 'song', 'artist', 'album', 'date', 'time',
       'duration_minutes'],
      dtype='object')
Total minutes played: 230057.0628166667
Total hours played: 3834.2843802777784
Total days played: 159.76184917824077
Number of unique artists: 2792
Number of unique songs: 8388


In [4]:
# Longest and shortest play durations, in minutes.
max_dur = big_data2['duration_minutes'].max()
min_dur = big_data2['duration_minutes'].min()
print(max_dur)
print(min_dur)

# Find the row(s) holding the longest duration.
# Compare against the computed maximum instead of a pasted float literal, so
# the lookup keeps working when the data (and therefore the maximum) changes.
long_song = big_data2[big_data2['duration_minutes'] == max_dur]
print(long_song)

# Remove rows where song, artist or album holds one of the "Unknown ..."
# placeholders; the longest play found above is such a row.
unknown_mask = (
    big_data2[['song', 'artist', 'album']]
    .isin(['Unknown Song', 'Unknown Artist', 'Unknown Album'])
    .any(axis=1)
)
big_data2 = big_data2[~unknown_mask]



242.87258333333332
0.0
       duration country          song          artist          album  \
75479  14572355      MY  Unknown Song  Unknown Artist  Unknown Album   

            date     time  duration_minutes  
75479  10/3/2024  3:50:18        242.872583  


In [5]:
# Distribution of song duration (minutes).
# plotly.express is already imported as `px` in the first code cell, so it is
# not imported again here.
fig = px.histogram(
    big_data2,
    x='duration_minutes',
    nbins=50,
    title='Distribution of Song Duration',
    # Show a readable axis label instead of the raw column name.
    labels={'duration_minutes': 'Duration (minutes)'},
)
fig.show()


In [6]:
# Top 15 artists by number of rows (plays) in the data.
top_artists = big_data2['artist'].value_counts().nlargest(15)

# Horizontal bar chart. The artist name is drawn on each bar because the
# y tick labels are hidden in the layout below.
fig = px.bar(
    top_artists, x=top_artists.values, y=top_artists.index,
    title='Top 15 Artist', orientation='h',
    color_discrete_sequence=['green'],
    text=top_artists.index
)

# Font size of the text drawn on the bars.
fig.update_traces(
    textfont=dict(size=10)
)

# Dark theme, narrow fixed width, centered title.
fig.update_layout(
    xaxis_title='Play Count', yaxis_title='Artist',
    template='plotly_dark',
    title_font=dict(color='white'),
    title_x=0.5,
    xaxis=dict(showgrid=True, gridcolor='gray'),
    # Categorical y values are laid out bottom-to-top by default, which would
    # put the #1 artist at the bottom; reverse the axis so rank 1 is on top.
    yaxis=dict(
        showgrid=True, gridcolor='gray', showticklabels=False,
        autorange='reversed'
    ),
    paper_bgcolor='rgb(30, 30, 30)',
    plot_bgcolor='rgb(30,30,30)',
    width=250,
    margin=dict(l=20, r=20, t=50, b=20)
)

# Export the chart as a standalone HTML file, then display it inline.
fig.write_html("static/charts/section1-artist.html")

fig.show()

In [12]:
# Top 15 (album, artist) pairs by number of rows (plays) in the data.
top_album = big_data2.groupby(['album', 'artist']).size().reset_index(name='count')

top_album = top_album.nlargest(15, 'count')

# Horizontal bar chart; the artist name is drawn on each bar and is also
# available on hover.
fig = px.bar(
    top_album, x='count', y='album', title='Top 15 Album',
    hover_data=['artist'], orientation='h', text='artist',
    color_discrete_sequence=['yellow'])

# Font size of the text drawn on the bars.
fig.update_traces(
    textfont=dict(size=10)
)

# Dark theme, narrow fixed width, centered title.
fig.update_layout(
    xaxis_title='Play Count', yaxis_title='Album',
    title_x=0.5,
    template='plotly_dark',
    title_font=dict(color='white'),
    xaxis=dict(showgrid=True, gridcolor='gray'),
    # Categorical y values are laid out bottom-to-top by default, which would
    # put the #1 album at the bottom; reverse the axis so rank 1 is on top.
    yaxis=dict(
        showgrid=True, gridcolor='gray', showticklabels=False,
        autorange='reversed'
    ),
    paper_bgcolor='rgb(30, 30, 30)',
    plot_bgcolor='rgb(30,30,30)',
    width=250,
    margin=dict(l=20, r=20, t=50, b=20)
)

# Export the chart as a standalone HTML file, then display it inline.
fig.write_html("static/charts/section1-album.html")

fig.show()

In [8]:
# Top 15 songs by number of rows (plays) in the data.
# Counting is by song title only, so identically titled songs are merged.
top_song = big_data2['song'].value_counts().nlargest(15)

# Horizontal bar chart. The song title is drawn on each bar because the
# y tick labels are hidden in the layout below.
fig = px.bar(
    top_song, x=top_song.values, y=top_song.index,
    title='Top 15 Song', orientation='h',
    color_discrete_sequence=['fuchsia'],
    text=top_song.index
)

# Font size of the text drawn on the bars.
fig.update_traces(
    textfont=dict(size=10)
)

# Dark theme, narrow fixed width, centered title.
fig.update_layout(
    xaxis_title='Play Count', yaxis_title='Song',
    template='plotly_dark',
    title_font=dict(color='white'),
    title_x=0.5,
    xaxis=dict(showgrid=True, gridcolor='gray'),
    # Categorical y values are laid out bottom-to-top by default, which would
    # put the #1 song at the bottom; reverse the axis so rank 1 is on top.
    yaxis=dict(
        showgrid=True, gridcolor='gray', showticklabels=False,
        autorange='reversed'
    ),
    paper_bgcolor='rgb(30, 30, 30)',
    plot_bgcolor='rgb(30,30,30)',
    width=250,
    margin=dict(l=20, r=20, t=50, b=20)
)

# Export the chart as a standalone HTML file, then display it inline.
fig.write_html("static/charts/section1-song.html")

fig.show()

In [9]:
# Number of rows (plays) per country code, most frequent first.
country_counts = big_data2['country'].value_counts()

# Vertical bar chart: one bar per country code.
fig = px.bar(
    country_counts,
    x=country_counts.index,
    y=country_counts.values,
    title="Number of Songs listened by Country I'm in",
)
fig.update_layout(
    xaxis_title='Country',
    yaxis_title='Number of Songs',
)
fig.show()

In [10]:
# # Listening trends over time
#
# NOTE(review): this whole cell is disabled (commented out).
# `date` looks like a d/m/YYYY string (e.g. '26/8/2014' in the head() output),
# so grouping and sorting on it orders the days as text rather than
# chronologically - a likely reason the line plot looked wrong.
# Parsing the column first, e.g. pd.to_datetime(big_data2['date'], format='%d/%m/%Y'),
# is the probable fix - TODO confirm before re-enabling.

# #group by date to count songs per day
# songs_per_day = big_data2.groupby('date').size().reset_index(name='count')
# songs_per_day = songs_per_day.sort_values('date')

# #create the line plot
# fig = px.line(songs_per_day, x='date', y='count', title='Number of Songs Played over time (2014 - 2024)')

# fig.update_layout(
#     xaxis_title='Date', 
#     yaxis_title='Number of Songs'
#     )

# fig.show()

In [11]:
# # song density based on country i was at
#
# NOTE(review): this whole cell is disabled (commented out).

# # create a mapping of ISO alpha-2 codes to country names
# iso_to_country = {
#     'MY': 'Malaysia',
#     'AU': 'Australia',
#     'ID': 'Indonesia',
#     'TH': 'Thailand',
#     'AE': 'United Arab Emirates',
#     'FR': 'France',
#     'TR': 'Turkey',
#     'DE': 'Germany',
#     'IT': 'Italy',
#     'CZ': 'Czech Republic',
#     'HU': 'Hungary',
#     'IN': 'India',
#     'BG': 'Bulgaria',
#     'RO': 'Romania',
#     'SG': 'Singapore'
# }

# # replace the alpha-2 codes with country names
# # (country codes missing from the mapping above become NaN via .map)
# big_data2['country_name'] = big_data2['country'].map(iso_to_country)

# #aggregate data by country name
# song_density = big_data2.groupby('country_name').size().reset_index(name='song_count')

# # choropleth map
# # NOTE(review): Plotly documents 'ISO-3', 'USA-states' and 'country names' as
# # the locationmode values; 'country' is not among them - it probably needs to
# # be 'country names' here. TODO confirm before re-enabling.
# fig = px.choropleth(
#     song_density, 
#     locations='country_name',
#     locationmode='country',
#     color='song_count',
#     title='Song density by Country',
#     color_continuous_scale=px.colors.sequential.Plasma,
#     labels={'song_count':'Number of Songs Played'}
# )

# # update layout
# fig.update_layout(
#     geo=dict(
#         showcoastlines=True,
#         coastlinecolor='Black', 
#         showland=True,
#         landcolor='white'
#     )
# )

# fig.show()