# Music Popularity Analysis

<hr style="border:2px solid black">

## Notebook 05 - API Join

---

### Import libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
def num_uniques(ser):
    """Return the number of distinct values in ``ser``.

    Parameters
    ----------
    ser : pandas.Series
        Any object exposing a ``unique()`` method works.

    Returns
    -------
    int or str
        ``len(ser.unique())``, or the string ``"Not unique check-able"`` when
        ``unique()`` raises ``TypeError`` (e.g. the elements are unhashable,
        such as lists or dicts).
    """
    try:
        return len(ser.unique())
    except TypeError:
        # Only the expected failure is handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return "Not unique check-able"

In [None]:
def summarize_df(df):
    """Print a full overview of ``df``.

    Shows, in order: the shape, the column dtypes, a ``head()`` preview,
    the ``describe()`` table and the unique-value count of every column.
    """
    n_rows, n_cols = df.shape
    print("======DATA SUMMARY======")
    print("{} rows by {} columns".format(n_rows, n_cols))

    print("\n======COLUMNS======")
    print(df.dtypes)

    print("\n======PREVIEW======")
    display(df.head())

    print("\n======NUMERICAL COL SUMMARY======")
    print(df.describe())
    print("\n")

    for column_name in df.columns:
        unique_count = num_uniques(df[column_name])
        print("{}: {} unique values".format(column_name, unique_count))

In [None]:
def quicksum(df):
    """Print a short overview of ``df``: shape, ``head()`` preview and per-column unique counts."""
    n_rows, n_cols = df.shape
    print("======DATA SUMMARY======")
    print("{} rows by {} columns".format(n_rows, n_cols))

    print("\n======PREVIEW======")
    display(df.head())

    for column_name in df.columns:
        unique_count = num_uniques(df[column_name])
        print("{}: {} unique values".format(column_name, unique_count))

---
### Import data

In [None]:
# Load the last.fm release / track / artist / play-count export.
# The file's first line is skipped and the columns at positions 1-4 are
# given the explicit names below.
lfm_count = pd.read_csv(
    'gid_name_art_count.csv',
    skiprows=1,
    usecols=[1, 2, 3, 4],
    names=['mb_release_gid', 'lfm_track_name', 'lfm_artist_name', 'lfm_play_count'],
)
summarize_df(lfm_count)

In [None]:
# Collapse repeated (release, track, artist, play count) rows: grouping on all
# four columns leaves one row per distinct combination, with 'size' holding
# how often it occurred. Rows with a missing key are dropped by groupby's
# default dropna=True.
lfm_rel_name = (
    lfm_count[['mb_release_gid', 'lfm_track_name', 'lfm_artist_name', 'lfm_play_count']]
    .groupby(['mb_release_gid', 'lfm_track_name', 'lfm_artist_name', 'lfm_play_count'], as_index=False)
    .size()
)
# Summarise while the 'size' column is still present, then discard it
summarize_df(lfm_rel_name)
lfm_rel_name = lfm_rel_name.drop(columns='size')

In [None]:
# Normalise track names for fuzzy matching: keep only alphanumeric
# characters and lower-case them
lfm_rel_name['lfm_rel_name_prcs'] = lfm_rel_name['lfm_track_name'].apply(
    lambda name: ''.join(ch.lower() for ch in name if ch.isalnum())
)

In [None]:
# Custom function to generate a list of track names and dictionary of track names to play counts
def groupfunc(df):
    """Collapse the rows of one group into a single summary record.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; must contain the columns 'mb_release_gid',
        'lfm_rel_name_prcs' and 'lfm_play_count'.

    Returns
    -------
    pandas.Series
        'mb_release_gid'  : the group's whole 'mb_release_gid' column
                            (a Series, not a scalar)
        'lfm_track_names' : list of the 'lfm_rel_name_prcs' values
        'lfm_play_counts' : dict mapping each processed name to its play
                            count; for a repeated name the last row wins
    """
    # Materialise each column once; the name -> count mapping is then built
    # in a single pass instead of re-listing the counts for every row.
    track_names = df['lfm_rel_name_prcs'].values.tolist()
    play_counts = df['lfm_play_count'].values.tolist()
    return pd.Series({
        'mb_release_gid': df['mb_release_gid'],
        'lfm_track_names': track_names,
        'lfm_play_counts': dict(zip(track_names, play_counts)),
    })

In [None]:
# Build one record per release via groupfunc
lfm_rel_ids = (
    lfm_rel_name
    .groupby('mb_release_gid', as_index=False)
    .apply(groupfunc)
)
# groupfunc stores the group's whole 'mb_release_gid' column in each record;
# reduce it to its first entry
lfm_rel_ids['mb_release_gid'] = lfm_rel_ids['mb_release_gid'].apply(lambda gids: gids.iloc[0])
quicksum(lfm_rel_ids)

In [None]:
# Load the columns at positions 1 and 2 of the track/artist URI file
sp_track_artist = pd.read_csv(
    'uri_artist.csv',
    usecols=[1, 2],
)
quicksum(sp_track_artist)

In [None]:
# One row per distinct (artist URI, track URI) pair; 'size' counts repeats
sp_track_artist_c = (
    sp_track_artist
    .groupby(['sp_artist_uri', 'sp_track_uri'], as_index=False)
    .size()
)
# Summarise with the repeat count visible, then drop it
quicksum(sp_track_artist_c)
sp_track_artist_c = sp_track_artist_c.drop(columns='size')

In [None]:
# Load the columns at positions 1-4 of the artist info file
sp_artist_info = pd.read_csv(
    'artisturi_artistinfo.csv',
    usecols=[1, 2, 3, 4],
)
quicksum(sp_artist_info)

In [None]:
# One row per distinct combination of all columns; 'size' counts repeats
sp_artist_info_c = (
    sp_artist_info
    .groupby(list(sp_artist_info.columns), as_index=False)
    .size()
)
# Summarise with the repeat count visible, then drop it
quicksum(sp_artist_info_c)
sp_artist_info_c = sp_artist_info_c.drop(columns='size')

In [None]:
# Load the columns at positions 1-14 of the audio features file
sp_track_audiof = pd.read_csv(
    'isrc_uri_audiofeats.csv',
    usecols=range(1, 15),
)
quicksum(sp_track_audiof)

In [None]:
# One row per distinct combination of all columns; 'size' counts repeats
sp_track_audiof_c = (
    sp_track_audiof
    .groupby(list(sp_track_audiof.columns), as_index=False)
    .size()
)
# Summarise with the repeat count visible, then drop it
quicksum(sp_track_audiof_c)
sp_track_audiof_c = sp_track_audiof_c.drop(columns='size')

In [None]:
# Load the columns at positions 1 and 2 of the ISRC/URI file
sp_uri_isrc = pd.read_csv(
    'isrc_uri.csv',
    usecols=[1, 2],
)
quicksum(sp_uri_isrc)

In [None]:
# Distinct (isrc, sp_uri) pairs
sp_uri_isrc_c_1 = (
    sp_uri_isrc
    .groupby(['isrc', 'sp_uri'], as_index=False)
    .size()
    .drop(columns='size')
)
# Per ISRC: the first URI and the number of distinct URIs
sp_uri_isrc_c = (
    sp_uri_isrc_c_1
    .groupby('isrc', as_index=False)
    .agg({'sp_uri': ['first', 'count']})
)
# Flatten the two-level column labels, e.g. ('sp_uri', 'first') -> 'sp_uri_first'
# and ('isrc', '') -> 'isrc'
sp_uri_isrc_c.columns = ['_'.join(parts).strip('_') for parts in sp_uri_isrc_c.columns.values]
# Keep only ISRCs that map to exactly one URI, then drop the helper count
sp_uri_isrc_c = (
    sp_uri_isrc_c
    .loc[sp_uri_isrc_c['sp_uri_count'] == 1]
    .drop(columns='sp_uri_count')
)
quicksum(sp_uri_isrc_c)

In [None]:
# Load release gid, ISRC and track name from the parquet file and prefix the
# release/track columns with 'mb_' to match the naming used in the joins below
mb_db_songs = (
    pd.read_parquet('mb_db_songs_2019.parquet')[['release_gid', 'isrc', 'track_name']]
    .rename(columns={'release_gid': 'mb_release_gid', 'track_name': 'mb_track_name'})
)
quicksum(mb_db_songs)

In [None]:
# Start from a copy so mb_db_songs itself stays untouched
working_df = mb_db_songs.copy()
# Missing track names become '' so the character filter below can iterate them
working_df['mb_track_name'] = working_df['mb_track_name'].fillna('')
# Same normalisation as the last.fm names: alphanumeric characters only, lower-cased
working_df['mb_track_name_prcs'] = working_df['mb_track_name'].apply(
    lambda name: ''.join(ch.lower() for ch in name if ch.isalnum())
)
# Inner join: keep only rows whose ISRC is present in sp_uri_isrc_c
working_df = working_df.merge(sp_uri_isrc_c, how='inner', on='isrc')
quicksum(working_df)

In [None]:
# Inner join: attach each release's last.fm track-name list and play-count dict.
# NOTE: this cell overwrites working_df with a merge of itself, so running it
# twice merges a second time; re-run the cell that builds working_df first.
working_df = pd.merge(working_df, lfm_rel_ids, how='inner', on='mb_release_gid')
quicksum(working_df)

In [None]:
# Collapse to one row per (isrc, sp_uri_first) pair. GroupBy.first() takes,
# for every remaining column, the first non-null value within the group.
# With as_index=False the result already has flat string column labels, so no
# column flattening is needed afterwards.
working_df = working_df.groupby(['isrc', 'sp_uri_first'], as_index=False).first()
quicksum(working_df)

In [None]:
# For every row, ask fuzzywuzzy for the best candidate among that row's
# last.fm track names for the processed MusicBrainz track name. Later cells
# read element 0 of the result as the matched name and element 1 as the score.
working_df['match'] = working_df.apply(
    lambda row: process.extractOne(row['mb_track_name_prcs'], row['lfm_track_names']),
    axis=1,
)

In [None]:
# Split the 'match' result into its two parts: element 0 -> 'matched',
# element 1 -> 'score'
working_df['matched'] = working_df['match'].apply(lambda pair: pair[0])
working_df['score'] = working_df['match'].apply(lambda pair: pair[1])
# Look up the matched name in the row's own name -> play-count dict
working_df['play_count'] = working_df.apply(
    lambda row: row['lfm_play_counts'][row['matched']],
    axis=1,
)

In [None]:
# Look up the matched name in the row's own name -> play-count dict.
# NOTE(review): this repeats the 'play_count' assignment made in the previous
# cell; the result is identical, so this cell looks safe to remove.
working_df['play_count'] = working_df.apply(
    lambda row: row['lfm_play_counts'][row['matched']],
    axis=1,
)

In [None]:
# Full summary of the joined frame: shape, dtypes, preview, describe() table
# and per-column unique counts
summarize_df(working_df)

In [None]:
# Show the 100 rows with the highest play counts
display(
    working_df
    .sort_values('play_count', ascending=False)
    .head(100)
)

In [None]:
# Among rows whose match score is above 80, count the rows that share the
# same (release, matched name, play count) combination
(
    working_df
    .loc[working_df['score'] > 80]
    .groupby(['mb_release_gid', 'matched', 'play_count'])
    .size()
)

### TODO
- Step through collection and processing, clean up notebooks, and investigate any biases (also double-check that the data covers 2019 onward)
- Preprocess text
- Conduct EDA / initial summarization
- Gather more data?
- Research and outline ML model types
- Perform preprocessing on each ML model
- Perform hyperparameter tuning on each ML model
- Compare results of models
    - Accuracy
    - Coefficient / parameter results
- Pick 'best' model and analyze results
    - How did it select? (Show 'inner workings')
    - What does it tell us?
    - What might we do next?
- Write up final report