In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from datetime import timedelta

#### Retrieve eligible users for prediction

In [2]:
# Location of the training labels; only the user id column (msno) is loaded.
train_dir = os.path.join(os.pardir, 'data', 'raw', 'train.csv')
s_users = pd.read_csv(train_dir, usecols=['msno'])

In [3]:
# array of user ids found in the training file; used below to filter the logs
current_users = s_users['msno'].values

#### Compute monthly activity (song length count per month)

In [4]:
# Build the path to the raw user logs and count the lines in the file.
# The count is the denominator of the progress indicator in the processing loop.
# NOTE: every line of the file is counted, including a header line if present.
user_log_dir = os.path.join(os.pardir, 'data', 'raw', 'user_logs.csv')
# the context manager guarantees the file handle is closed once counting is done
with open(user_log_dir) as log_file:
    Total_rows = sum(1 for _ in log_file)
print('Number of rows = ', Total_rows)

Number of rows =  392106544


In [5]:
# Number of rows read per chunk. read_csv documents chunksize as an integer,
# so the value is stored as an int rather than as a float literal.
# (a smaller value such as int(1e6) is handy for quick experiments)
Num_rows = int(20e6)

In [6]:
# Read the user logs lazily, Num_rows rows at a time. Only the user id, the date
# (parsed and used as the index) and the num_* count columns are loaded.
log_columns = ['msno', 'date', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100']
reader_iter = pd.read_csv(
    user_log_dir,
    usecols=log_columns,
    index_col='date',
    parse_dates=['date'],
    chunksize=Num_rows
)

In [7]:
# Empty accumulator for the monthly counts, indexed by (msno, date).
# MultiIndex.from_arrays is used because the `labels` constructor argument was
# renamed to `codes` and later removed from pandas.
empty_index = pd.MultiIndex.from_arrays([[], []], names=['msno', 'date'])
df_song_length = pd.DataFrame(
    index=empty_index,
    columns=['num_25', 'num_50', 'num_75', 'num_985', 'num_100']
).astype('int64')
# dtype matches reader_iter inferred type (int64)

In [8]:
# Stream the log chunks, aggregate each one per user and calendar month, and
# fold the partial sums into df_song_length.
start_time = time()   # reference point for the elapsed-time display
row_counter = 0       # rows read so far, counted before filtering

for df_chunk in reader_iter:

    # count every row read to track progress through the file
    row_counter += len(df_chunk)

    # restrict the chunk to eligible users
    is_eligible = df_chunk['msno'].isin(current_users)
    df_chunk = df_chunk[is_eligible]

    # per-user monthly totals for this chunk (date is the index level)
    monthly = pd.Grouper(level='date', freq='M')
    df_song_length_chunk = df_chunk.groupby(['msno', monthly], sort=False).sum()

    # accumulate; entries present on only one side are treated as 0
    df_song_length = df_song_length.add(df_song_length_chunk, fill_value=0)

    # overwrite the progress line after each processed chunk
    pct_done = row_counter / Total_rows * 100
    elapsed = timedelta(seconds=time() - start_time)
    print('\r {:10.1f}% done \t--- {} seconds ellapsed ---'.format(pct_done, elapsed),
          end='', flush=True)

      100.0% done 	--- 0:36:02.802240 seconds ellapsed ---

#### Pivot dates

In [9]:
# move the date index level into the columns; missing (user, month) pairs become 0
df_unstack = df_song_length.unstack('date', fill_value=0)

In [10]:
# Persist the full-history monthly counts so the aggregation need not be re-run.
interim_dir = os.path.join(os.pardir, 'data', 'interim')
Userlog_inter_dir = os.path.join(interim_dir, 'song_length_fulldate_rawcount.p34')
df_unstack.to_pickle(Userlog_inter_dir)
# to reload instead of recomputing: df_unstack = pd.read_pickle(Userlog_inter_dir)

In [11]:
# Collect the month labels to DISCARD: every month except the 6 most recent.
# The labels are sorted explicitly so the slice does not depend on column order.
all_months = df_unstack.columns.get_level_values(level=1).unique().sort_values()
past_6_month = all_months[:-6]

In [12]:
# keep only the most recent months by dropping the older month labels (column level 1)
df_unstack = df_unstack.drop(past_6_month, axis='columns', level=1)

NOTE: Some users have a membership but no recorded activity in the user logs, so they do not appear in the activity table.

In [13]:
# eligible users that have no row in the aggregated activity table
num_missing_users = len(current_users) - len(df_unstack)
print('Number of missing users =', num_missing_users)

Number of missing users = 123005


#### Add cumulative sum of count for each song length. 
For instance, `cumnum_50` is the sum of the counts for the first two song-length levels (`num_25` and `num_50`), i.e. the number of songs played for at most 50% of their length.

In [14]:
# distinct top-level column labels (the song-length count names), in column order
song_length_level = df_unstack.columns.get_level_values(0).unique()

In [15]:
# used for multi-indexing
idx = pd.IndexSlice

# For each song-length level after the first, derive the per-month cumulative
# count of that level and all preceding levels.
cum_frames = []
for sl in range(2, len(song_length_level) + 1):

    # song-length levels to accumulate in this step
    cum_length = song_length_level[:sl]

    # sum the selected levels for each month (column level 1);
    # groupby on the transposed frame replaces sum(level=...), which newer pandas removed
    df_selected = df_unstack.loc[:, idx[cum_length, :]]
    df_cum = df_selected.T.groupby(level=1, sort=False).sum().T

    # name of the new top-level column, e.g. 'cum' + 'num_50'
    cum_name = ''.join(['cum', cum_length[-1]])

    # add higher hierarchical index
    df_cum.columns = pd.MultiIndex.from_product([[cum_name], df_cum.columns])

    cum_frames.append(df_cum)

# Append the cumulative features to ALL rows of the existing features.
# (Slicing with df_unstack[:3] selects the first three rows, not columns, which
# would leave the original features as NaN for every other user.)
df_unstack = pd.concat([df_unstack] + cum_frames, axis=1)

#### Average count per day over monthly period

In [16]:
# divide each monthly column by the number of days in that month
days_per_month = df_unstack.columns.get_level_values(1).daysinmonth
df_unstack = df_unstack.divide(days_per_month)

In [17]:
# write the final per-day song-length features to the processed data folder
processed_dir = os.path.join(os.pardir, 'data', 'processed')
Userlog_proc_dir = os.path.join(processed_dir, 'song_length.p34')
df_unstack.to_pickle(Userlog_proc_dir)