In [46]:
import os

import numpy as np
import pandas as pd

## Consolidate daily charts data to monthly files

In [47]:
months_start = pd.date_range(start='2019-09-01', end='2020-09-25', freq='MS')
months_end = pd.date_range(start='2019-09-01', end='2020-09-30', freq='M')

In [48]:
months_end

DatetimeIndex(['2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31',
               '2020-09-30'],
              dtype='datetime64[ns]', freq='M')

In [49]:
for ms,me in zip(months_start, months_end):
    print("Processing %s..." % ms.strftime("%b %Y"))
    
    #generate month dates
    month_dates = pd.date_range(start = ms,end = me , freq='D')
    #initialize DataFrame list
    df_list = []
    #open and clean chart data for all dates in month dates
    for date in month_dates:
        try:
            df = pd.read_csv("data/daily_charts_raw/ph-"+date.strftime("%Y-%m-%d")+".csv")
            df['track_id'] = df['URL'].apply(lambda x: x.split('/')[4])
            df.columns=[col.lower().replace(' ','_') for col in df.columns]
            df = df[['date','position','track_id','track_name','artist','streams']]
            df_list.append(df)
        except:
            print("\tChart not available for date %s..." % date.strftime("%Y-%m-%d"))
        
    month_df = pd.concat(df_list)
    month_df.to_csv('data/monthly/'+'ph_'+ms.strftime("%Y%m")+".csv", index=False, encoding='utf-8')

#delete df list to clear up memory
del df_list

Processing Sep 2019...
Processing Oct 2019...
Processing Nov 2019...
Processing Dec 2019...
Processing Jan 2020...
Processing Feb 2020...
Processing Mar 2020...
Processing Apr 2020...
Processing May 2020...
Processing Jun 2020...
Processing Jul 2020...
Processing Aug 2020...
Processing Sep 2020...
	Chart not available for date 2020-09-16...
	Chart not available for date 2020-09-17...
	Chart not available for date 2020-09-18...
	Chart not available for date 2020-09-19...
	Chart not available for date 2020-09-20...
	Chart not available for date 2020-09-21...
	Chart not available for date 2020-09-22...
	Chart not available for date 2020-09-23...
	Chart not available for date 2020-09-24...
	Chart not available for date 2020-09-25...
	Chart not available for date 2020-09-26...
	Chart not available for date 2020-09-27...
	Chart not available for date 2020-09-28...
	Chart not available for date 2020-09-29...
	Chart not available for date 2020-09-30...


> Q: Can you revise the loop to concatenate daily chart data to *quarterly* chunks?

## Consolidate monthly data to a single file

In [50]:
df_list = []
for ms in months_start:
    print("Appending %s..." % ms.strftime("%b %Y"))
    df = pd.read_csv('data/monthly/'+'ph_'+ms.strftime("%Y%m")+".csv")
    df_list.append(df)
        
all_df = pd.concat(df_list)
all_df.to_csv('data/spotify_daily_charts.csv', index=False, encoding='utf-8')

#delete df list to clear up memory
del df_list    

Appending Sep 2019...
Appending Oct 2019...
Appending Nov 2019...
Appending Dec 2019...
Appending Jan 2020...
Appending Feb 2020...
Appending Mar 2020...
Appending Apr 2020...
Appending May 2020...
Appending Jun 2020...
Appending Jul 2020...
Appending Aug 2020...
Appending Sep 2020...


In [51]:
all_df.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams
0,2019-09-01,1,6IdEQ3HUseBeRUYFSzSMdL,Zebbiana,Skusta Clee,267548
1,2019-09-01,2,6v3KW9xbzN5yKLt9YKDYA2,Señorita,Shawn Mendes,259828
2,2019-09-01,3,1dGr1c8CrMLDpV6mPbImSI,Lover,Taylor Swift,240408
3,2019-09-01,4,0gDRtumoR9ZrvAlnniToMz,Ikaw At Ako,Moira Dela Torre,236726
4,2019-09-01,5,5l9g7py8RCblcvbZgGQgSd,Pagtingin,Ben&Ben,190049


In [52]:
all_df['date'].min(),all_df['date'].max()

('2019-09-01', '2020-09-15')

In [53]:
#check if consolidated file has expected number of rows
len(all_df), len(pd.date_range(start='2018-01-01', end='2020-09-15', freq='D'))*200

(76200, 197800)

### Resources

- String Formatting https://strftime.org/