In [None]:
# user_id = 'notbobbobby'

# GET PLAYLISTS
# playlists = get_user_playlists(user_id, [discover_database=True])

# GET TRACKS
# tracks = get_user_tracks(user_id, playlists)

# GET FEATURES
# features = get_user_unique_tracks_and_features(user_id, tracks)

# COMBINE INTO ONE DATAFRAME
# df = pd.merge(tracks, playlists, how='outer', on=['playlist_id', 'playlist_name'])
# df = pd.merge(df, features, how='outer', on=['track_id', 'track_name'])

# Note: indieair showed missing songs, which points to a discrepancy between the
#       playlists' total_tracks and the number of tracks actually returned

import os
import pandas as pd
import altair as alt

# Every export is looked up relative to the directory the notebook was started from.
ROOT = os.getcwd()


def get_data(db, user, time, kind='library'):
    """Read one line-delimited JSON export into a DataFrame.

    The file is always named ``<user>_<time>_<kind>.json``. For the
    ``'_history'`` database it is read from ``ROOT/<db>/<user>/``; for any
    other database it is read from ``ROOT/<db>/<user>/<time>/``.

    Parameters
    ----------
    db : str
        Name of the database folder under ``ROOT``.
    user : str
        Name of the user folder; also the first part of the file name.
    time : str
        Second part of the file name; for non-history databases it is also
        the name of the sub-folder that holds the file.
    kind : str, default 'library'
        Last part of the file name.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.read_json(path, orient='records', lines=True)``.
    """
    parts = [ROOT, db, user]
    # Only '_history' keeps its files directly in the user folder.
    if db != '_history':
        parts.append(time)
    parts.append('{}_{}_{}.json'.format(user, time, kind))

    path = os.path.expanduser(os.path.join(*parts))
    return pd.read_json(path, orient='records', lines=True)

from datetime import timedelta

# Load the 2018-10-31 'library' exports stored under the '_db' folder.
library = get_data('_db', 'notbobbobby', '2018-10-31')
discover = get_data('_db', 'discover', '2018-10-31')

# Load each user's 2018 'history' export and stack them into a single frame.
history_n = get_data('_history', 'notbobbobby', '2018', kind='history')
history_d = get_data('_history', 'deedanvy', '2018', kind='history')
history_c = get_data('_history', 'c.bochulak', '2018', kind='history')
history = pd.concat([history_n, history_d, history_c])

# Shift every play timestamp back six hours with one vectorized subtraction
# rather than a per-row Python lambda.
# NOTE(review): presumably this converts UTC to a UTC-6 local time; a fixed
# offset ignores daylight saving time -- confirm.
history['played_at'] = history['played_at'] - timedelta(hours=6)

In [None]:
# Let's begin by looking at duplicated songs

# Keep every row whose (track_id, discover_id) pair appears more than once
# (keep=False flags all copies, not only the later ones), count the rows in
# each group, and list the most repeated tracks first.
# The chain is wrapped in parentheses so that no line depends on a trailing
# backslash followed by a blank line.
dup = (
    discover[discover.duplicated(subset=['track_id', 'discover_id'], keep=False)]
    .groupby(['track_id', 'track_name', 'artists', 'discover_id'])
    .size()
    .reset_index(name='occurrence')
    .sort_values(by='occurrence', ascending=False)
)

dup.head()

In [None]:
# First step: cross-correlation
# Target layout, one row per repeated appearance of a track:
#   [track_id, track_name, artists, album] [first_occurrence, first_id] [second_occurrence, second_id]

# REQUIRED: order the rows by created_at first. duplicated() decides which copy
# counts as the "first" purely from row order, so unsorted input would label
# the wrong row as the first occurrence.
sorted_tracks = discover.sort_values(by='created_at', ascending=True).reset_index()

# has_copies marks every row of a track_id that appears more than once;
# later_copy marks all of those rows except the earliest one.
has_copies = sorted_tracks.duplicated(subset='track_id', keep=False)
later_copy = sorted_tracks.duplicated(subset='track_id', keep='first')

# Sub-table: [track_id, track_name, album, artists] [first_occurrence, first_id]
# -- the earliest row of each repeated track.
dup_first = (
    sorted_tracks
    .loc[has_copies & ~later_copy,
         ['track_id', 'track_name', 'album', 'artists', 'created_at', 'discover_id']]
    .rename(columns={'created_at': 'first_occurrence', 'discover_id': 'first_id'})
)

# Sub-table: [track_id, second_occurrence, second_id]
# -- every later row of a repeated track (second, third, ... appearance).
dup_second = (
    sorted_tracks
    .loc[later_copy, ['track_id', 'created_at', 'discover_id']]
    .rename(columns={'created_at': 'second_occurrence', 'discover_id': 'second_id'})
)

# Merge tables, then drop every row that has a missing value in any column.
# NOTE(review): each track in dup_first also has at least one row in dup_second,
# so the outer merge should not leave unmatched rows; dropna() then mainly
# removes rows with missing metadata (e.g. album) -- confirm this is intended.
timeline = (
    dup_first
    .merge(dup_second, how='outer', on='track_id')
    .dropna(axis=0)
)

In [None]:
# Accounts to compare: only rows whose first_id AND second_id are both in this
# list are plotted.
people = ['notbobbobby', 'deedanvy']

# Display colour for each known account; only the entries for `people` are used below.
colors = {
    'notbobbobby': '#1DB954',
    'deedanvy': '#883677',
    'c.bochulak': '#16BAC5',
    'eriica_k': 'steelblue'}

subset = timeline[
    timeline.first_id.isin(people) &
    timeline.second_id.isin(people)]

# Scatter plot of first vs. second occurrence time for each repeated track.
(
    alt.Chart(subset)
    .mark_point()
    .encode(
        x=alt.X('first_occurrence:T', axis=alt.Axis(title='first occurrence')),
        y=alt.Y('second_occurrence:T', axis=alt.Axis(title='second occurrence')),
        # Points whose two timestamps are equal are drawn light gray; all
        # others are coloured by first_id.
        color=alt.condition(
            'datum.first_occurrence == datum.second_occurrence',
            alt.value('lightgray'),
            'first_id:N',
            legend=alt.Legend(title='first heard by'),
            scale=alt.Scale(
                domain=people,
                range=[colors[p] for p in people])),
        tooltip=[
            alt.Tooltip('track_name', title='track'),
            alt.Tooltip('first_occurrence:T', title='First', format='%Y_%m_%d'),
            # Declared temporal explicitly, matching the y channel and the
            # 'First' tooltip, since a date format string is applied to it.
            alt.Tooltip('second_occurrence:T', title='Next', format='%Y_%m_%d')])
    .properties(title='who listened first')
)
    