In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from datetime import timedelta

#### Retrieve eligible users for prediction

In [2]:
# path of the training set, relative to the notebook's parent directory
train_dir = os.path.join(
    os.pardir, 'data', 'raw', 'train.csv'
)
# only the user-id column ('msno') is loaded; every other column is skipped
s_users = pd.read_csv(train_dir, usecols=['msno'])

In [3]:
# numpy array of the user ids read from the training set
current_users = s_users['msno'].values

#### Compute monthly activity

In [4]:
# create file path and compute total number of lines
user_log_dir = os.path.join(os.pardir, 'data', 'raw', 'user_logs.csv')
# Count the lines inside a context manager so the file handle is closed as soon
# as the count is done (the bare open() in a generator left it open).
# Every physical line is counted, so a header line, if present, is included.
with open(user_log_dir) as log_file:
    Total_rows = sum(1 for _ in log_file)
print('Number of rows = ', Total_rows)

Number of rows =  392106544


In [5]:
# number of rows read per chunk; kept as an integer because it is passed as
# `chunksize` to pd.read_csv, which is documented to take an int
# (use 10**6 instead for a quick trial run)
Num_rows = 20 * 10**6

In [6]:
# Author's note: the user log is sorted by customer id.
# Build a chunked reader over the log: only the three needed columns are read,
# 'date' is parsed as a datetime and becomes the index of every chunk.
reader_iter = pd.read_csv(
    user_log_dir,
    usecols=['msno', 'date', 'num_unq'],
    parse_dates=['date'],
    index_col='date',
    chunksize=Num_rows,
)

In [7]:
# Empty accumulator for the per-user, per-month totals, indexed by (msno, date).
# The index is built with MultiIndex.from_arrays because the `labels=` keyword of
# the MultiIndex constructor was renamed to `codes=` in pandas 0.24 and later
# removed; from_arrays gives the same empty two-level index on old and new pandas.
df_song_unq = pd.DataFrame(
    index=pd.MultiIndex.from_arrays([[], []], names=['msno', 'date']),
    columns=['num_unq'],
).astype('int64')
# dtype matches reader_iter inferred type (int64)

In [8]:
# wall-clock reference for the elapsed-time report
start_time = time()
row_counter = 0

for df_chunk in reader_iter:

    # running total of raw rows read so far (counted before any filtering)
    row_counter += len(df_chunk)

    # restrict the chunk to users listed in current_users
    df_chunk = df_chunk[df_chunk['msno'].isin(current_users)]

    # sum per user and per calendar month (the 'date' index level is bucketed monthly)
    df_song_unq_chunk = df_chunk.groupby(
        ['msno', pd.Grouper(level='date', freq='M')],
        sort=False,
    ).sum()

    # fold the chunk into the running totals; a (user, month) pair that is
    # missing on one side is treated as 0 on that side
    df_song_unq = df_song_unq.add(df_song_unq_chunk, fill_value=0)

    # rewrite the same output line with the share of rows processed and the elapsed time
    pct_done = row_counter / Total_rows * 100
    elapsed = timedelta(seconds=time() - start_time)
    print('\r {:10.1f}% done \t--- {} seconds ellapsed ---'.format(pct_done, elapsed), end='', flush=True)

      100.0% done 	--- 0:34:00.406870 seconds ellapsed ---

#### Pivot dates

In [21]:
# pivot the 'date' index level into columns (one column per month);
# user/month combinations with no data are filled with 0
df_unstack = df_song_unq.unstack('date', fill_value=0)

In [22]:
# save the pivoted table with all months (raw totals, before trimming and
# normalisation) as an interim artefact
Uniquefull_inter_dir = os.path.join(
    os.pardir, 'data', 'interim', 'song_unique_fulldate_rawcount.p34'
)
df_unstack.to_pickle(Uniquefull_inter_dir)

In [13]:
# collect every month label EXCEPT the last six along the column axis,
# i.e. the labels of the columns that are to be dropped
post_6_month = df_unstack.columns.get_level_values(1).unique()[:-6]

In [14]:
# discard the month columns listed in post_6_month, keeping only the last six months
df_unstack = df_unstack.drop(labels=post_6_month, level=1, axis='columns')

NOTE: There are users who have a membership but did not show any activity

In [15]:
# eligible users that have no row in the activity table
# (assumes current_users holds each user once — TODO confirm train.csv has no duplicate msno)
num_missing_users = len(current_users) - len(df_unstack.index)
print('Number of missing users =', num_missing_users)

Number of missing users = 123005


#### Convert monthly count to average number of unique songs per day

In [18]:
# turn each monthly total into a per-day average: every month column is divided
# by the number of days in that calendar month
df_unstack = df_unstack.divide(
    df_unstack.columns.get_level_values(1).daysinmonth
)

In [19]:
# preview the first five users of the normalised table
df_unstack.head(5)

Unnamed: 0_level_0,num_unq,num_unq,num_unq,num_unq,num_unq,num_unq
date,2016-09-30,2016-10-31,2016-11-30,2016-12-31,2017-01-31,2017-02-28
msno,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0.0,0.0,16.266667,31.580645,22.451613,22.928571
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,23.566667,25.516129,18.166667,21.806452,27.096774,25.785714
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,18.666667,23.935484,32.966667,20.806452,35.096774,19.678571
++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,19.033333,11.225806,8.166667,11.032258,4.193548,6.928571
++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,8.0,7.516129,5.5,8.709677,9.806452,9.321429


In [20]:
# write the final six-month, per-day-normalised table to the processed data folder
Unique_proc_dir = os.path.join(
    os.pardir, 'data', 'processed', 'song_unique.p34'
)
df_unstack.to_pickle(Unique_proc_dir)