## ANALYZING PERSONAL SPOTIFY DATA from 2023-04-18 to 2024-04-18

In [72]:
import pandas as pd
import matplotlib.dates as mdates
import seaborn as sns


In [73]:
data1 = pd.read_csv(r'/Users/aparnakalla/Downloads/StreamingHistory_music_0.csv')

In [74]:
data2 = pd.read_csv(r'/Users/aparnakalla/Downloads/StreamingHistory_music_1.csv')

In [75]:
data = pd.concat([data1, data2])

In [76]:
data

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-04-18 19:34,Ark Woods,First Flight To Mars,144232
1,2023-04-19 04:17,Kaifi Khalil,Kahani Suno 2.0,173637
2,2023-04-19 04:34,Charlie Puth,Light Switch,148654
3,2023-04-19 04:42,Charlie Puth,Light Switch,39186
4,2023-04-19 04:45,Mitraz,Alfaazo,158447
...,...,...,...,...
8916,2024-04-18 12:14,Sam Fischer,What Other People Say,1970
8917,2024-04-18 12:14,Madhur Sharma,Medley: Kehna Galat Galat / Halka Halka Suroor,480020
8918,2024-04-18 12:14,Ed Sheeran,Afterglow,780
8919,2024-04-18 12:17,Ed Sheeran,Afterglow,185486


In [77]:
data['endDate'] = data['endTime'].dt.date


print("Data is from " + str(min(data['endDate'])) + " to " + str(max(data['endDate'])))

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
data['endTime'] = pd.to_datetime(data['endTime'])

In [None]:
top_tracks = data.groupby('trackName')['msPlayed'].sum().sort_values(ascending=False).head(10)
top_tracks_hours = top_tracks / (1000 * 60 * 60)

# Display the top 10 tracks
print(top_tracks_hours)

In [None]:
top_artists = data.groupby(by='artistName')['msPlayed'].sum().sort_values(ascending=False).head(10)
top_artists_hours = top_artists / (1000 * 60 * 60)
print(top_artists_hours)

In [None]:
# Assuming an average song length of 3 minutes (180000 ms)
skipped_tracks = data[data['msPlayed'] < 100000]

# Calculate the percentage of skipped tracks
skipped_percentage = len(skipped_tracks) / len(data) * 100

# Display the skipped percentage
print(f"Percentage of skipped tracks: {skipped_percentage:.2f}%")


In [None]:
artist_days = data.groupby('artistName')['endTime'].nunique()

# Calculate the total number of days in the dataset
total_days = data['endTime'].dt.date.nunique()

# Calculate repeat listen rate for top 10 artists
top_artist_loyalty = (artist_days / total_days).sort_values(ascending=False).head(10)

# Display the top 10 artists by loyalty
print(top_artist_loyalty)


In [None]:
# Calculate unique artists and tracks listened to
unique_artists = data['artistName'].nunique()
unique_tracks = data['trackName'].nunique()

# Display the results
print(f"Total unique artists: {unique_artists}")
print(f"Total unique tracks: {unique_tracks}")


In [None]:
cutoff_date = pd.to_datetime('2024-01-01')

old_tracks = data[data['endTime'] < cutoff_date]
new_tracks = data[data['endTime'] >= cutoff_date]

# Compare top artists in old vs. new periods
old_top_artists = old_tracks.groupby('artistName')['msPlayed'].sum().sort_values(ascending=False).head(10)/(1000*60)
new_top_artists = new_tracks.groupby('artistName')['msPlayed'].sum().sort_values(ascending=False).head(10)/(1000*60)

old_top_artists_list = old_top_artists.index.tolist()
new_top_artists_list = new_top_artists.index.tolist()

consistent_fav_artists = list(set(old_top_artists_list) & set(new_top_artists_list))

# Display the results
print("Old top artists:")
print(old_top_artists)

print("\nNew top artists:")
print(new_top_artists)

print("\nConsistent favorite artists:")
print(consistent_fav_artists)


In [None]:
data = data.sort_values(by='endTime')

data['time_diff'] = data['endTime'].diff().dt.total_seconds() / 60  # Convert to minutes

data['new_session'] = (data['time_diff'] > 30).cumsum()

session_analysis = data.groupby('new_session').agg(
    session_start=('endTime', 'min'),
    session_end=('endTime', 'max'),
    session_duration=('msPlayed', lambda x: x.sum() / (1000 * 60)),  
    tracks_played=('trackName', 'count')
)


long_sessions = session_analysis[session_analysis['session_duration'] > 30]  


common_artists_long_sessions = data[data['new_session'].isin(long_sessions.index)].groupby('artistName').size().sort_values(ascending=False).head(10)
common_tracks_long_sessions = data[data['new_session'].isin(long_sessions.index)].groupby('trackName').size().sort_values(ascending=False).head(10)

print("\nTop Artists in Long Sessions:")
print(common_artists_long_sessions)

print("\nTop Tracks in Long Sessions:")
print(common_tracks_long_sessions)


In [None]:
session_analysis['day_of_week'] = session_analysis['session_start'].dt.day_name()

avg_duration_by_day = session_analysis.groupby('day_of_week')['session_duration'].mean().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
)

plt.figure(figsize=(10, 6))
avg_duration_by_day.plot(kind='bar')
plt.title('Average Session Duration by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Average Session Duration (Minutes)')
plt.show()


In [None]:
plt.figure(figsize=(12, 6))
daily_playtime_hours.plot()

plt.title('Daily Playtime (Hours)')
plt.xlabel('Date')
plt.ylabel('Total Playtime (Hours)')

plt.gca().xaxis.set_major_locator(mdates.MonthLocator())  # Set major ticks to each month
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))  # Format ticks as "Month Year"

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.show()


In [None]:
session_analysis['hour'] = session_analysis['session_start'].dt.hour

heatmap_data = session_analysis.groupby(['day_of_week', 'hour']).size().unstack().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
)


plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="YlGnBu", cbar_kws={'label': 'Number of Sessions'})
plt.title('Listening Sessions by Day and Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of the Week')
plt.show()


In [None]:
data['hour'] = data['endTime'].dt.hour
hourly_playtime = data.groupby('hour')['msPlayed'].sum()/(1000*60*60)

# Plot the most active listening hours
plt.figure(figsize=(12, 6))
hourly_playtime.plot(kind='bar')
plt.title('Most Active Listening Hours')
plt.xlabel('Hour of Day')
plt.ylabel('Total Playtime (hours)')
plt.show()


In [None]:
data['rolling_playtime'] = data['msPlayed'].rolling(window=7).mean()/1000

# Plot the rolling average of daily playtime
plt.figure(figsize=(12, 6))
plt.plot(data['endTime'], data['rolling_playtime'])
plt.title('7-Day Rolling Average of Playtime')
plt.xlabel('Date')
plt.ylabel('Rolling Playtime (seconds)')
plt.show()
