In [1]:
import os
import pandas as pd
from time import time
from datetime import timedelta

#### Retrieve eligible users for prediction

In [2]:
# Location of the training set, relative to the notebook's parent directory.
train_dir = os.path.join(os.pardir, 'data', 'raw', 'train.csv')

# Only the user identifier column ('msno') is needed here, so the other
# columns of the file are not loaded.
s_users = pd.read_csv(
    train_dir,
    usecols=['msno'],
)

In [3]:
# Array of the user ids read from the training file; used further down to
# filter the activity log and to count users without any activity.
current_users = s_users['msno'].values

#### Compute monthly activity

In [4]:
# Build the path to the raw user-log file and count its lines so that the
# chunked pass over the file can report its progress as a percentage.
user_log_dir = os.path.join(os.pardir, 'data', 'raw', 'user_logs.csv')

# Use a context manager so the file handle is closed as soon as the count is
# done (a bare open() inside the generator leaves it open until it is garbage
# collected). Every physical line is counted, so the CSV header line is part
# of the total.
with open(user_log_dir) as log_file:
    Total_rows = sum(1 for _ in log_file)
print('Number of rows = ', Total_rows)

Number of rows =  392106544


In [5]:
# Number of rows read per chunk when streaming the user log.
# A smaller value such as 1e6 can be used for quick experiments.
Num_rows = 20 * 1e6

In [13]:
# NOTE: the user log is sorted by customer id.
# Create a lazy iterator over the log that yields one DataFrame per chunk of
# Num_rows rows. Only the three columns needed are loaded, and 'date' is
# parsed as a datetime and used as the index of each chunk.
reader_iter = pd.read_csv(
    user_log_dir,
    usecols=['msno', 'date', 'total_secs'],
    parse_dates=['date'],
    index_col='date',
    chunksize=Num_rows,
)
# NOTE: total_secs could not be cast to int32 to save memory: it is a floating
# point column and contains extremely high values that require 64 bits.

In [14]:
# Empty accumulator for the per-user / per-month totals, indexed by
# (msno, date) with a single 'total_secs' column.
# MultiIndex.from_arrays is used instead of the MultiIndex(levels=..., labels=...)
# constructor: the 'labels' keyword was renamed to 'codes' and later removed,
# so the constructor form raises a TypeError on current pandas releases.
empty_user_month_index = pd.MultiIndex.from_arrays([[], []], names=['msno', 'date'])

# Initialize the column as float64 so that it matches the dtype of the data
# read from the log.
df_song_tot = pd.DataFrame(index=empty_user_month_index,
                           columns=['total_secs']).astype('float64')

In [15]:
# Stream the user log chunk by chunk, aggregate total_secs per user and per
# calendar month inside each chunk, and fold every partial result into the
# global df_song_tot accumulator.
start_time = time()   # reference point for the elapsed-time display
row_counter = 0       # number of raw rows consumed so far

for df_chunk in reader_iter:

    # progress is measured on raw rows, i.e. before any filtering
    row_counter += len(df_chunk)

    # rows belonging to users that are eligible for prediction
    is_eligible = df_chunk['msno'].isin(current_users)

    # rows with a plausible listening time: strictly positive and below
    # 24*3600 (presumably seconds, i.e. one full day).
    # Author's rationale: a monthly average is computed afterwards, so
    # replacing the discarded values by the mean would not change the result.
    secs = df_chunk['total_secs']
    is_plausible = (secs > 0) & (secs < 24 * 3600)

    df_chunk = df_chunk[is_eligible & is_plausible]

    # sum per user and per month (the 'date' index is bucketed by month)
    user_month_keys = ['msno', pd.Grouper(level='date', freq='M')]
    df_song_tot_chunk = df_chunk.groupby(user_month_keys, sort=False).sum()

    # merge the partial sums into the running totals; a (user, month) pair
    # present on one side only is treated as 0 on the other side
    df_song_tot = df_song_tot.add(df_song_tot_chunk, fill_value=0)

    # overwrite the progress line after each processed chunk
    pct_done = row_counter/Total_rows*100
    elapsed = timedelta(seconds=time() - start_time)
    print('\r {:10.1f}% done \t--- {} seconds ellapsed ---'.format(pct_done, elapsed),
          end='', flush=True)

      100.0% done 	--- 0:35:23.197050 seconds ellapsed ---

#### Pivot dates

In [56]:
# Pivot the 'date' index level into columns: one row per user, one column per
# month. (user, month) combinations without data are filled with 0.
df_unstack = df_song_tot.unstack('date', fill_value=0)

In [57]:
# save all data available (optional)
# Totalfull_inter_dir = os.path.join(os.pardir, 'data', 'interim', 'song_total_fulldate_rawcount.p34')
# df_unstack.to_pickle(Totalfull_inter_dir)

In [58]:
# Labels of every month EXCEPT the last six, i.e. the month columns that are
# going to be discarded.
month_labels = df_unstack.columns.get_level_values(1).unique()
post_6_month = month_labels[:-6]

In [59]:
# Drop the month columns that come before the last six months; the month
# labels live on the second level of the column index.
df_unstack = df_unstack.drop(post_6_month, axis='columns', level=1)

NOTE: There are users who have a membership but did not show any activity

In [60]:
# Eligible users that do not appear in the pivoted activity table, i.e. users
# for whom no activity row was kept.
num_users_with_activity = len(df_unstack.index)
num_missing_users = len(current_users) - num_users_with_activity
print('Number of missing users =', num_missing_users)

Number of missing users = 123006


#### Convert monthly totals to average listening time (seconds) per day

In [61]:
# Divide each monthly total by the number of days in that month, so every
# column holds a per-day average instead of a monthly sum.
days_per_month = df_unstack.columns.get_level_values(1).daysinmonth
df_unstack = df_unstack.divide(days_per_month)

In [62]:
# Reduce the memory footprint by storing the float64 values as int32.
# Sub-second precision is not needed; the cast drops the fractional part.
df_unstack = df_unstack.astype(dtype='int32')

In [63]:
# Preview the first five rows of the final table.
df_unstack.head(n=5)

Unnamed: 0_level_0,total_secs,total_secs,total_secs,total_secs,total_secs,total_secs
date,2016-09-30,2016-10-31,2016-11-30,2016-12-31,2017-01-31,2017-02-28
msno,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,0,4955,8284,5092,5590
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,5780,6152,4444,4999,6390,6750
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,3940,4911,9647,5454,6959,4420
++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,6816,4588,4312,4282,3420,2816
++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,1314,1361,980,1750,1669,1757


In [64]:
# Persist the processed table as a pickle in the 'processed' data directory.
Total_proc_dir = os.path.join(
    os.pardir, 'data', 'processed', 'song_total.p34'
)
df_unstack.to_pickle(Total_proc_dir)