## Read in data

In [1]:
import pandas as pd
import pickle
from datetime import datetime
from time import mktime
from nltk.metrics import edit_distance
from siuba import *


# Load the two pickled library snapshots (2016 and 2019). The context managers
# close each file handle as soon as its pickle has been read.
# NOTE: pickle.load can execute arbitrary code -- only load snapshots you created yourself.
with open('../data/2016-07-15/itl.p', 'rb') as snapshot_file:
    lib1 = pickle.load(snapshot_file)
with open('../data/2019-07-31/itl.p', 'rb') as snapshot_file:
    lib2 = pickle.load(snapshot_file)

%load_ext blackcellmagic

In [2]:
# Turn every (key, song) pair of each library into a plain dict via ToDict().
raw_data1 = [song.ToDict() for _key, song in lib1.songs.items()]
raw_data2 = [song.ToDict() for _key, song in lib2.songs.items()]

In [3]:
# Siuba expression: drop missing values, convert each remaining value with
# mktime -> datetime.fromtimestamp, then cast the result to datetime64[ns].
# NOTE(review): values are presumably time.struct_time (what mktime expects) -- confirm.
# NOTE(review): mktime/fromtimestamp both work in local time -- confirm that is intended.
convert_date = (_
    .dropna()
    .apply(lambda x: datetime.fromtimestamp(mktime(x)))
    .astype("datetime64[ns]")
)

# Reusable cleaning pipeline applied to both library snapshots:
#   1. convert the three date columns with convert_date,
#   2. keep the "date" columns plus the listed metadata columns,
#   3. drop rows whose date_added is missing.
# NOTE(review): convert_date(...) calls one siuba expression with another; how that
# is evaluated presumably depends on the installed siuba version -- re-check on upgrade.
# NOTE: `filter` here is presumably siuba's verb (star-imported), shadowing the builtin.
clean_all = (
    mutate(
        date_added = convert_date(_.date_added),
        date_modified = convert_date(_.date_modified),
        skip_date = convert_date(_.skip_date)
    ) >>
    select(
        _.contains("date"),
        _.artist, _.name, _.album, _.genre, _.kind,
        _.persistent_id, _.play_count,
        _.rating, _.skip_count, _.track_id,
        _.track_number, _.year
    ) >>
    filter(~_.date_added.isna()) # Shared playlist songs shouldn't be included
)


# Build one cleaned frame per snapshot by piping the raw song records through clean_all.
data1 = pd.DataFrame(raw_data1) >> clean_all  # from lib1
data2 = pd.DataFrame(raw_data2) >> clean_all  # from lib2

In [4]:
# Number of songs in data2 per year of date_added.
data2 >> count(added_year=_.date_added.dt.year)

Unnamed: 0,added_year,n
0,2006,3
1,2007,2
2,2008,3
3,2009,3
4,2010,6
5,2011,133
6,2012,171
7,2013,114
8,2014,48
9,2015,48


In [105]:
# Persist both cleaned frames. The context managers flush and close each file
# deterministically instead of leaving that to garbage collection.
with open("../data/data1.p", "wb") as out_file:
    pickle.dump(data1, out_file)
with open("../data/data2.p", "wb") as out_file:
    pickle.dump(data2, out_file)

## Match songs from 2016 <=> 2019

There should be three dataframes that will need to be concatenated:
1. `joined_on_pid`, the songs that were not changed (they had the same persistent_id in 2016 as 2019);
2. `joined`, the songs that were matched (exactly) using artist and name (excluding `joined_on_pid`);
3. And `manually_joined`, the songs from `data1` that are not included in the above DFs that are manually matched to new versions.

The goal is to ensure that all the songs added before the switch to Apple Music (AM) have accurate metadata. Songs added after the switch to AM are almost exclusively AM songs.

In [14]:
# (
#     data1 >>
#     inner_join(_, data2, on = "track_id") >>
#     pipe(lambda d: d[sorted(d.columns)])
# )
# Track id is not enough to use as a join key

In [5]:
# Match songs on persistent_id first; these matches are excluded from the later join attempts.
# Columns are sorted by name so each *_x / *_y pair sits side by side.
joined_on_pid = (
    data1
    >> inner_join(_, data2, on="persistent_id")
    >> pipe(lambda merged: merged.loc[:, sorted(merged.columns)])
)

In [40]:
joined_raw = (
    data1
    # Flag the songs already matched on persistent_id and keep only the unflagged ones
    # (stand-in for: anti_join(joined_on_pid, on = 'persistent_id')).
    >> left_join(
        _,
        joined_on_pid >> transmute(_.persistent_id, in_pid_table=True),
        on="persistent_id",
    )
    >> filter(_.in_pid_table.isnull())
    # Exact match of the remaining songs on name and artist.
    >> inner_join(_, data2, on=["name", "artist"])
    >> pipe(lambda merged: merged.loc[:, sorted(merged.columns)])
)

In [46]:
# Manually correct problem children from the join
# Show every column when wide frames are displayed.
pd.options.display.max_columns = None
# NOTE(review): imports belong in the top import cell; `n` is also not used in
# this cell (the pipeline below uses _.shape[0]) -- confirm it is still needed.
from siuba.dply.vector import n
# n(_) is the siuba equivalent of shape[0]

# We will be looping over persistent_id_y.
# Manually confirm in a CSV that all of the songs matched to a single persistent_id_y should in fact be matched.
# The column `n` is presumably the number of joined_raw rows sharing each
# persistent_id_y (grouped mutate); rows are then ordered with the largest n first
# and written to CSV for manual review.
(
    joined_raw 
    >> group_by(_.persistent_id_y)
    >> mutate(n = _.shape[0]) 
    >> ungroup()
    >> arrange(-_.n, _.artist, _.name)
#     >> filter(_.n > 1)
).to_csv('../data/manual_cleaning/joined_raw.csv')

In [6]:
# Load the manually cleaned CSV, discard the rows flagged with remove == 1,
# then drop the flag column itself.
reviewed_matches = pd.read_csv('../data/manual_cleaning/joined_clean.csv')
joined_clean = reviewed_matches >> filter(_.remove != 1) >> select(-_.remove)

In [7]:
# Songs from data1 that are still unmatched: their persistent_id appears neither in
# joined_on_pid nor (as persistent_id_x) in joined_clean.
needs_to_be_joined = (
    data1
    >> filter(
        ~_.persistent_id.isin(joined_on_pid.persistent_id),
        ~_.persistent_id.isin(joined_clean.persistent_id_x),
    )
)

In [114]:
# Save the still-unmatched songs for manual matching: persistent_id becomes
# persistent_id_x and an empty persistent_id_y column is added to be filled in.
unmatched_for_manual_join = (
    needs_to_be_joined
    >> rename(persistent_id_x = "persistent_id")
    >> mutate(persistent_id_y = None)
)
# The context manager flushes and closes the file deterministically.
# NOTE(review): the file name says "needs_to_by_joined" (sic); kept unchanged in
# case other code reads this exact path.
with open("../data/manual_cleaning/needs_to_by_joined.p", "wb") as out_file:
    pickle.dump(unmatched_for_manual_join, out_file)

In [24]:
# Take edit_distance out for a spin
def calculate_distance(s):
    """Return a siuba expression: drop nulls, then edit distance of each value to ``s``."""

    def distance_to_target(value):
        return edit_distance(value, s)

    return _.dropna().apply(distance_to_target)


artist_distance = calculate_distance('Deux')

# Idea not yet applied: bucket the distances into deciles, e.g.
#     >> mutate(artist_distance_ntile=pd.qcut(_.distance, 10, labels=list(range(10))))
test = joined_on_pid >> mutate(distance=artist_distance(_.artist_x))

In [1]:
# Read in cleaned CSV
# ERRORS: sixteen, what a wonderful world, goyim friends, concert for george (got auto-matched)
# Add back music i've made (roundup, stockade, mashups)

## Update metadata

Metadata to change:
- date_added
- play_count
- skip_count
- genre

We will group by the new `persistent_id` and aggregate data for each of the above columns to create an output CSV that reflects the final changes.

#### date_added

In [None]:
# Combine the two snapshots' date_added values, keeping the earlier one per song
# (a first step towards spotting albums whose songs have different added dates --
# chances are most albums were added on a single date).
#
# Series.where keeps date_added_x wherever it is strictly earlier than
# date_added_y and falls back to date_added_y otherwise. The previous
# `_.apply(lambda d: ...)` omitted axis=1, so the lambda received whole columns
# rather than rows and could not read d.date_added_x.
(
    joined_on_pid
    >> mutate(
        date_added=_.date_added_x.where(
            _.date_added_x < _.date_added_y, _.date_added_y
        )
    )
)

#### Play Count

In [20]:
# Rule for play counts:
#   - same exact song (matched in joined_on_pid): keep the larger, i.e. most recent, count;
#   - different song (different filetype, different pid): sum the two counts.
# Sanity check: how often is the older snapshot's play count the larger one?
(
    joined_on_pid
    >> select(_.contains("play_count"))
    >> count(old_plays_is_larger=_.play_count_x > _.play_count_y)
)

Unnamed: 0,old_plays_is_larger,n
0,False,540


#### Skip Count

In [21]:
# Rule for skip counts in joined_on_pid: keep the most recent number.
# Sanity check: how often is the older snapshot's skip count the larger one?
(
    joined_on_pid
    >> select(_.contains("skip_count"))
    >> count(old_skips_is_larger=_.skip_count_x > _.skip_count_y)
)

Unnamed: 0,old_skips_is_larger,n
0,False,540


#### Genre

In [26]:
# Songs per genre in data2; the trailing arrange puts the table in genre order.
data2 >> count(_.genre, sort=True) >> arrange(_.genre, _.n)

Unnamed: 0,genre,n
0,Action & Adventure,2
1,Adult Contemporary,2
2,Alternative,319
3,Bluegrass,1
4,Blues,4
5,Children's Music,10
6,Christian & Gospel,3
7,Classical,44
8,Comedy,11
9,Country,159


In [None]:
# Canonical genre -> the variant labels that should be folded into it.
genre_variants = {
    'Hip-Hop/Rap': [
        'East Coast Rap',
        'Gangsta Rap',
        'Hardcore Rap',
        'Hip Hop/Rap',
        'Hip-Hop',
        'Hip-hop & Rap',
        'Rap',
    ],
    'R&B/Soul': ['R&B', 'Soul'],
    'Folk': ['Traditional Folk'],
    'Dance': ['Trance'],
}

# Lookup used for relabelling: variant label -> canonical genre.
genre_map = {
    variant: canonical
    for canonical, variants in genre_variants.items()
    for variant in variants
}

### Hunt for duplicates

## Why are some added dates missing from the 2019 lib?
Answer: Songs are included here even if `date_added` is NA. This happens when you add shared playlists without adding the individual songs to your library.

In [5]:

# Number of rows with a missing date_added in each cleaned snapshot (data1, then data2).
print(*(frame.date_added.isna().sum() for frame in (data1, data2)))

0 0


In [11]:

# Per artist, count the songs in data2 with a missing date_added, largest count first.
missing_added_by_artist = (
    data2
    >> group_by(_.artist)
    >> summarize(ttl_na=_.date_added.isna().sum())
    >> arrange(-_.ttl_na)
)
missing_added_by_artist.head()

Unnamed: 0,artist,ttl_na
0,"""Weird Al"" Yankovic",0
1,10cc,0
2,2 Chainz,0
3,21 Savage,0
4,21 Savage & Metro Boomin,0


In [12]:
# data2 >> filter(_.artist == "Mura Masa")

Aha! Songs for shared playlists appear in the library but have not been "added". It is safe to exclude them.