# CS109a - Spotify Project - Scratch notebook

In [1]:
# imports
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import sqlalchemy as db
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Table, Column, Integer, String, MetaData
from sqlalchemy import create_engine
import sqlite3
from sqlalchemy.orm import sessionmaker

## Data read functions

### Spotify Data

In [9]:
''' The staff provided data has filenames 'songs0.csv' through 'songs999.csv'
    This function returns a single dataframe containing the desired CSV files.
    Args:
        1) file_nums:  list ... List of file #s to read (e.g. 0, 1, 2 ... 999); default is to just read 'songs0.csv'
        2) songs_dir:  str  ... Where the songs CSVs reside; default is 'songs/'
    
    You can pass a single parameter "ALL_FILES" (defined below) to simply get a full dataframe of all CSVs.
'''

def read_songs_csv(file_nums: list = [0], songs_dir: str = 'songs/') -> pd.core.frame.DataFrame:
    # check for valid input
    if any(i < 0 for i in file_nums) or any(i > 999 for i in file_nums) or len(file_nums) == 0:
        print('ERROR - read_songs_csv: Bad parameters; file numbers are out of bounds, or list is empty')
        return
    
    # put all listed songs CSV #s into a dataframe
    df = pd.concat((pd.read_csv(songs_dir + 'songs' + str(i) + '.csv') for i in file_nums))
    
    return df

In [10]:
# for convenience, constants to read all files or a randomly selected assortment of files
# (re-run cell for a different random assortment!)

# WARNING: this will take awhile and requires ~16+ GB RAM to be even feasible (and still not worth it!)
ALL_FILES = np.arange(0, 1000, 1) 

# reads N random files
N = 10
RANDOM = np.random.randint(0, 1000, N)

### LastFM

In [None]:
''' The lastFM data set is organized into individual JSON files, in nested directories.
    This function returns a single dataframe containing JSON data. It is designed to work with
    the pre-split train/test data sets for the subset of the 300GB million song data set.
    
    Args: lastfm_dir ... ROOT directory of the data containing letter folders
                        (e.g. lastfm_train or lastfm_test). Reads train data by default '''

def read_lastfm_json(lastfm_dir: str = 'lastfm_train') -> pd.core.frame.DataFrame:
    # put all jsons in a DF
    return 

In [12]:
import json

# test = pd.read_json('lastfm_train/A/A/A/TRAAAAK128F9318786.json')
# test

with open('lastfm_train/A/A/A/TRAAAAK128F9318786.json') as json_file:
    data = json.load(json_file)

test = pd.DataFrame(data, columns=data.keys())
test

ValueError: arrays must all be same length

## Reading data Examples - Spotify

In [11]:
# Read all files to DF in memory
#               ----------- commented out, just an example
#songs_df = read_songs_csv(ALL_FILES)
#songs_df.head()

# Read randomly assorted files

songs_df = read_songs_csv(RANDOM)
songs_df.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,0,0,Lil Uzi Vert,spotify:track:5wyrQLBbmKdxY6yLNdienj,spotify:artist:4O15NlyKLIASxsJ0PrXPfz,I Can Drive,spotify:album:6LX75kNicFqjjiAOeZgN67,153808,The Perfect LUV Tape
1,0,1,Young Thug,spotify:track:4cg1yakyRSIOjxKM2I7J1q,spotify:artist:50co4Is1HCEo8bhOyUWKpn,Digits,spotify:album:2z4c8M8aVzl7CTobIp36KF,176386,Slime Season 3
2,0,2,21 Savage,spotify:track:2kE82JhBcoS7mimFO1SqqX,spotify:artist:1URnnhqYAYcrqrcwql10ft,No Advance,spotify:album:4I3EcXD4e3KcEoDJfFEZ5b,276466,Savage Mode
3,0,3,Rich Chigga,spotify:track:7uuBfgY1vBCalrQRyyQhqs,spotify:artist:2IDLDx25HU1nQMKde4n61a,Gospel,spotify:album:79M1nv9rYaZvyt1GmyDAEa,173294,Gospel
4,0,4,Playboi Carti,spotify:track:1e1JKLEDKP7hEQzJfNAgPl,spotify:artist:699OTQXzgjhIYAHMy9RyPD,Magnolia,spotify:album:4rJgzzfFHAVFhCSt2P4I3j,181812,Playboi Carti


In [7]:
# Get some summary stats
summary_stats = {
    'total_records': songs_df['pid'].count(),
    'unique_artists': songs_df['artist_uri'].unique().size,
    'unique_songs': songs_df['track_uri'].unique().size,
    'unique_albums': songs_df['album_uri'].unique().size,
    'average_track_length': songs_df['duration_ms'].mean()
}
display(summary_stats)

{'total_records': 6649984,
 'unique_artists': 109410,
 'unique_songs': 683295,
 'unique_albums': 271958,
 'average_track_length': 234929.8703678084}

In [11]:
# Group by playlists
playlists_df = songs_df.groupby('pid').agg({
    'pos': np.mean,
    'duration_ms': np.mean
})

playlists_df

Unnamed: 0_level_0,pos,duration_ms
pid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,55.481721,231016.939017
1,56.614440,234642.595866
2,48.218701,235214.725133
3,51.795475,233915.773588
4,52.793873,235971.105573
5,46.131243,229379.479841
6,53.447182,235991.526640
7,52.855638,234893.126859
8,58.281458,239455.512413
9,49.904195,236778.802513
