In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the raw play log from the JSON file into a DataFrame.
song = pd.read_json(path_or_buf="song.json")

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
song.info()

In [None]:
# Parse both timestamp columns into pandas datetimes, replacing the columns in place.
for date_col in ("user_sign_up_date", "time_played"):
    song[date_col] = pd.to_datetime(song[date_col])

In [None]:
# Peek at three random rows. The fixed seed keeps the displayed rows identical
# on every Restart & Run All instead of changing with each execution.
song.sample(3, random_state=42)

In [None]:
# Top 3 states ranked by number of distinct users.
(
    song.groupby('user_state')['user_id']
    .nunique()
    .reset_index()
    .sort_values(by='user_id', ascending=False)
    .head(3)
)

In [None]:
# Top 3 states by engagement, where engagement = plays per distinct user
# ('count' of rows divided by 'nunique' user ids within the state).
(
    song.groupby('user_state')['user_id']
    .agg(['nunique', 'count'])
    .reset_index()
    .assign(engagement=lambda stats: stats['count'] / stats['nunique'])
    .sort_values(by='engagement', ascending=False)
    .head(3)
)

In [None]:
# Bottom 3 states by engagement, where engagement = plays per distinct user
# ('count' of rows divided by 'nunique' user ids within the state).
(
    song.groupby('user_state')['user_id']
    .agg(['nunique', 'count'])
    .reset_index()
    .assign(engagement=lambda stats: stats['count'] / stats['nunique'])
    .sort_values(by='engagement', ascending=True)
    .head(3)
)

In [None]:
# For each state, find the row label of the minimum (earliest) user_sign_up_date,
# then show the state and user id stored on those rows.
earliest_signup_idx = song.groupby('user_state')['user_sign_up_date'].idxmin()
song.loc[earliest_signup_idx, ['user_state', 'user_id']]

In [None]:
# Work on an independent (deep) copy so columns added to `df` do not touch `song`.
df = song.copy(deep=True)

In [None]:
# Peek at three random rows of the working copy. The fixed seed keeps the
# displayed rows identical on every Restart & Run All.
df.sample(3, random_state=42)

In [None]:
# Order each user's plays chronologically, then pull the following row's song up
# by one position within the user. The last play of every user has no successor
# and gets NaN. The assignment aligns on the row index, so `df` keeps its order.
plays_in_order = df.sort_values(by=['user_id', 'time_played'])
df['next_song'] = plays_in_order.groupby('user_id')['song_played'].shift(-1)


In [None]:
# Keep only plays that have a successor ('next_song' is NaN for each user's last play).
# The filtered frame gets its own name instead of overwriting `df`: with `df`
# overwritten, re-running the 'next_song' cell and then this one would drop one
# more play per user on every pass.
plays_with_next = df.dropna(subset=['next_song'])

# Count how often each (song_played, next_song) pair occurs
transition_counts = (
    plays_with_next
    .groupby(['song_played', 'next_song'])
    .size()
    .reset_index(name='count')
)

# Number of recorded transitions out of each song (the probability denominator)
song_counts = (
    plays_with_next
    .groupby('song_played')
    .size()
    .reset_index(name='total_count')
)

# Attach the per-song totals and turn the pair counts into conditional probabilities
transition_counts = transition_counts.merge(song_counts, on='song_played')
transition_counts['probability'] = transition_counts['count'] / transition_counts['total_count']

# Define a function that takes a song and returns the most likely next song
def recommend_next_song(song_name, transitions=None):
    """Return the most likely follow-up song for ``song_name``.

    Parameters
    ----------
    song_name : str
        Song title to look up in the ``song_played`` column.
    transitions : pandas.DataFrame, optional
        Table with ``song_played``, ``next_song`` and ``probability`` columns.
        Defaults to the notebook-level ``transition_counts`` table.

    Returns
    -------
    tuple
        ``(next_song, probability)`` taken from the row with the highest
        probability among the transitions out of ``song_name``.

    Raises
    ------
    ValueError
        If the table holds no transition out of ``song_name``.
    """
    if transitions is None:
        transitions = transition_counts

    # Restrict the table to transitions that start at the requested song
    candidates = transitions[transitions['song_played'] == song_name]

    # An empty selection would make idxmax() fail with an opaque message, so
    # raise the same exception type with an explanation instead.
    if candidates.empty:
        raise ValueError(
            f"No recorded transitions for song {song_name!r}; cannot recommend a next song."
        )

    # Pick the row with the highest probability
    best = candidates.loc[candidates['probability'].idxmax()]
    return best['next_song'], best['probability']

# Demo: look up the most probable follow-up to "Revolution" and report it
next_song, probability = recommend_next_song(song_name="Revolution")
print(
    f"The most likely next song after 'Revolution' is '{next_song}' with a probability of {probability:.2f}"
)

We need to perform an A/B test (reference):

- Randomly split users into two groups: a Control group and an Experiment group.
- The Control group gets no recommendation strategy.
- The Experiment group is recommended the next song.
- After the experiment has run for some time, perform a one-tailed t-test on 'average #play per hour':
    - H0: the population 'average #play per hour' is the same in both groups
    - H1: the Experiment group's population 'average #play per hour' is higher than the Control group's