# Create features based on users' activity logs

In [1]:
import os
import re
import pandas as pd
import numpy as np
import psycopg2 as pg2
from time import time
from datetime import timedelta

In [2]:
# Monthly per-user aggregation of the activity log, from July 2016 onwards.
# total_secs_count counts only rows with a non-NULL total_secs, whereas
# total_count counts every log row of the (msno, month) group.
# The WHERE clause filters on the raw `date` column: since 2016-07-01 is a
# month boundary this selects exactly the same rows as comparing the truncated
# month, but it avoids evaluating date_trunc() for the filter and lets
# PostgreSQL use an index on `date` if one exists.
query_tot_agg = '''
                SELECT msno, date_trunc('month', date) AS month_year,
                SUM(total_secs) AS total_secs, COUNT(total_secs) AS total_secs_count,
                SUM(num_25) AS num_25,
                SUM(num_50) AS num_50,
                SUM(num_75) AS num_75,
                SUM(num_985) AS num_985,
                SUM(num_100) AS num_100,
                SUM(num_unq) AS num_unq,
                COUNT(*) AS total_count
                FROM Activity
                WHERE date >= '2016-07-01'
                GROUP BY msno, month_year
                ORDER BY msno, month_year
                '''

In [3]:
start_time = time()
# Connect to the database and retrieve the monthly aggregation.
# The password is read from the KKBOX_DB_PASSWORD environment variable instead
# of being stored in the notebook. When the variable is unset, None is passed
# and the driver falls back to its defaults (e.g. PGPASSWORD / ~/.pgpass).
conn = pg2.connect(dbname='kkbox', user='postgres',
                   password=os.environ.get('KKBOX_DB_PASSWORD'))
try:
    df_activity = pd.read_sql_query(query_tot_agg, conn)
finally:
    # psycopg2's `with conn:` only ends the transaction, it does not close
    # the connection, so close it explicitly even if the query fails.
    conn.close()

# Report the elapsed wall-clock time of the query.
print('{}'.format(timedelta(seconds=time() - start_time)))

datetime.timedelta(0, 2230, 43613)

In [4]:
# Row/column count of the aggregated frame, as a quick sanity check.
df_activity.shape

(9026403, 11)

In [5]:
# Column dtypes and memory footprint of the aggregated frame.
df_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9026403 entries, 0 to 9026402
Data columns (total 11 columns):
msno                object
month_year          object
total_secs          float64
total_secs_count    int64
num_25              int64
num_50              int64
num_75              int64
num_985             int64
num_100             int64
num_unq             int64
total_count         int64
dtypes: float64(1), int64(8), object(2)
memory usage: 757.5+ MB


In [6]:
# Preview the first rows; the query returns one row per (msno, month_year) group.
df_activity.head()

Unnamed: 0,msno,month_year,total_secs,total_secs_count,num_25,num_50,num_75,num_985,num_100,num_unq,total_count
0,////SJZL1C7NrCXGlhnW3lm7vuKDbfB+ALl99xDbSlE=,2016-07-01 00:00:00-04:00,36.808,2,2,0,0,0,0,2,2
1,///2pAPKetZe8zPqAwkYBjAUr+4pS8Rc6bsO4eGAlWI=,2016-07-01 00:00:00-04:00,292876.0,31,128,45,45,92,982,825,31
2,///2pAPKetZe8zPqAwkYBjAUr+4pS8Rc6bsO4eGAlWI=,2016-08-01 00:00:00-04:00,212593.0,31,88,56,42,70,708,766,31
3,///2pAPKetZe8zPqAwkYBjAUr+4pS8Rc6bsO4eGAlWI=,2016-09-01 00:00:00-04:00,370245.0,30,314,59,60,76,1283,1478,30
4,///2pAPKetZe8zPqAwkYBjAUr+4pS8Rc6bsO4eGAlWI=,2016-10-01 00:00:00-04:00,300341.0,31,68,46,34,28,1080,1016,31


#### Retrieve eligible users for prediction

In [7]:
# Path of the processed training set, relative to the parent of the working directory.
train_dir = os.path.join(os.pardir, 'data', 'processed', 'train.csv')
# Only the msno column is read; it is used below to filter the activity data.
s_users = pd.read_csv(train_dir, usecols = ['msno'])

In [8]:
# Keep data from eligible users only (those whose msno appears in s_users).
# The explicit copy makes the filtered frame independent of the frame it was
# selected from, so the column assignments performed on it afterwards do not
# raise SettingWithCopyWarning.
df_activity = df_activity[df_activity.msno.isin(s_users.msno)].copy()

In [9]:
# Shape after restricting the frame to eligible users.
df_activity.shape

(5694375, 11)

Convert month_year to datetime object and eliminate time

In [10]:
# Parse month_year into timezone-aware timestamps normalised to UTC.
# NOTE(review): the values carry a UTC offset from the database session;
# confirm that converting to UTC keeps the intended calendar date.
df_activity['month_year'] = pd.to_datetime(df_activity['month_year'], utc=True)

In [11]:
# Truncate the timestamps to day resolution, dropping the time-of-day part.
df_activity['month_year'] = df_activity['month_year'].values.astype('<M8[D]')

Set hierarchical indexes

In [12]:
# Index rows by user (msno) and month so later steps can group and unstack by index level.
df_activity = df_activity.set_index(['msno', 'month_year'])

Create cumulative sum of song count

In [13]:
# Capture the song-count columns whose suffix is numeric (num_25 ... num_100);
# requiring at least two digits after "num_" excludes num_unq.
# The pattern is a raw string because '\d' inside a normal string literal is
# an invalid escape sequence that Python warns about.
num_column = [col for col in df_activity.columns if re.match(r'num_\d\d+', col)]

In [14]:
# Build running totals over the song-count columns: each new column sums the
# first n_cols entries of num_column and is named after the last one included
# (e.g. the columns up to num_75 produce cum_75).
for n_cols in range(2, len(num_column) + 1):
    window = num_column[:n_cols]

    # reuse the numeric suffix of the last column in the window for the new name
    suffix = window[-1].rsplit('_', 1)[-1]

    # row-wise sum over the selected columns
    df_activity['cum_' + suffix] = df_activity[window].sum(axis=1)

Create fraction of unique songs

In [15]:
# Ratio of unique songs to the cumulative song count cum_100.
# NOTE(review): a row whose cum_100 is 0 yields NaN or inf here; confirm such rows cannot occur.
df_activity['frac_unq'] = df_activity['num_unq'].div(df_activity['cum_100'])

Normalize song count by number of activity days, same for total listening time

In [16]:
# Song-count columns to normalise: the raw num_* counts and the cum_* totals.
column2norm = [col for col in df_activity.columns if col[:3] in ['num', 'cum']]
# Normalise all song counts by the number of log rows in one vectorised step.
# The columns are replaced wholesale rather than written into through
# .loc[:, col], so the integer columns can become float without an in-place
# dtype conflict (which newer pandas versions warn about).
df_activity[column2norm] = df_activity[column2norm].div(df_activity['total_count'], axis=0)

In [17]:
# Average listening time: total seconds divided by the number of log rows
# that actually had a total_secs value.
df_activity['total_secs'] = df_activity['total_secs'].div(df_activity['total_secs_count'])

Drop total columns

In [18]:
# The two count columns were only needed as denominators; remove them.
df_activity = df_activity.drop(['total_secs_count', 'total_count'], axis=1)

Calculate percentage change from one month to another

In [19]:
# Relative change versus the same user's previous row: each value is divided
# by the value one row earlier within its msno group, then 1 is subtracted.
# Spelled out with shift() because it is faster than pct_change().
df_percent = df_activity.div(df_activity.groupby(level='msno', sort=False).shift()).sub(1)

In [20]:
# Tidy up the percentage-change frame in a single chain:
#   1. tag every column with a "_pct" suffix;
#   2. the first month of each user has no previous value, so fill NaN with 0;
#   3. a previous value of 0 followed by a non-zero one gives an infinite
#      increase, which is capped at 1.
df_percent = (
    df_percent
    .add_suffix('_pct')
    .fillna(0)
    .replace(to_replace=np.inf, value=1)
)

Combine percentage increase with raw count

In [21]:
# Place the percentage-change columns side by side with the normalised values.
df_activity = pd.concat([df_activity, df_percent], axis = 1)

Unstack to create one feature per month. When no data is available for a given month, fill it with 0.

In [22]:
# Pivot the innermost index level (month_year) into columns; user/month combinations without a row are filled with 0.
df_activity = df_activity.unstack(fill_value=0)

# TODO: Check whether the total_secs columns contain NaN after unstacking

In [23]:
# Output path of the pickled feature frame, relative to the parent of the working directory.
Activity_proc_dir = os.path.join(os.pardir, 'data', 'processed', 'activity_features.p34')
# Persist the final feature frame for later use.
df_activity.to_pickle(Activity_proc_dir)