In [2]:
#Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#Load the data
#members and labels are indexed by 'msno' right away, since they get index-joined further down
members = pd.read_csv('members_filtered.csv').set_index('msno')
labels = pd.read_csv('labels_filtered.csv').set_index('msno')

#transactions and user_logs keep the default index produced by read_csv
transactions = pd.read_csv('transactions_filtered.csv')
user_logs = pd.read_csv('user_logs_filtered.csv')

#user_logs.head()

In [4]:
def pd_to_date(df_col):
    """Parse YYYYMMDD-formatted dates into pandas datetimes.

    df_col: the value(s) to convert, e.g. a DataFrame column; every entry
        has to match the '%Y%m%d' format.
    Returns the result of pd.to_datetime for that input.
    """
    return pd.to_datetime(df_col, format = '%Y%m%d')

#Replace the raw YYYYMMDD 'date' column with parsed datetimes
user_logs['date'] = user_logs['date'].pipe(pd_to_date)
#user_logs.head()

High level plan.  Let's try several things:

* Initially get a table by user (1 row per user)
* User most recent date (max date)
* User first date (min date)
* How long they've been listening:  Min vs. max date by user
* Matrix of all the following (cartesian product)
    * Total X=(seconds, 100, 985, 75, 50, 25, unique), avg per day of X, maybe median per day of X
    * Last day, last 7 days, last 30 days, last 90, 180, 365, total (note last day is relative to user)
    * % change in periods above vs. outside that period.
    * ??? Total change in periods above vs. outside that period.
    * ??? Variation (do they listen consistently, or is it varied)?

In [6]:
#Group the log rows by user ('msno'); sort=False leaves the group keys unsorted
user_logs_gb = user_logs.groupby(by=['msno'], sort=False)

In [21]:
#This cell is slow (the groupby transform/agg walk the whole user_logs table)

#Append each user's max date to every row in the main table
user_logs['max_date'] = user_logs_gb['date'].transform('max')
#Whole days between each row and that user's max date, as a number for easier comparisons later.
#.dt.days is the vectorized form of .apply(lambda x: x.days), which looped over every row in Python.
user_logs['days_before_max_date'] = (user_logs['max_date'] - user_logs['date']).dt.days

#Generate user's first date, last date, and tenure
#user_logs_features is the primary per-user table built from user_logs (one row per msno)
user_logs_features = user_logs_gb['date'].agg(['max', 'min'])
user_logs_features.columns = ['max_date', 'min_date']
#Tenure is computed from the aggregated columns rather than with a lambda inside .agg():
#the label pandas auto-generates for a lambda differs between versions, so renaming it is fragile.
user_logs_features['listening_tenure'] = (
    user_logs_features['max_date'] - user_logs_features['min_date']
).dt.days  #.dt.days converts the timedelta to a day count

#Build the 3-level header (feature group, source column, feature name), used for joining data later
user_logs_features.columns = pd.MultiIndex.from_product(
    [['date_features'], ['date'], user_logs_features.columns]
)

In [23]:
# For X in (num_unq, total_secs, num_25, num_50, num_75, num_985, num_100): sum and mean of X per user,
# computed over several trailing windows. Windows are relative to each user's own max date.

#Window lengths in days (a row is in the window when days_before_max_date < num_days).
#NOTE(review): 999 presumably stands in for "total" (all history); that only holds while no
#user's logs span 999 days or more - confirm against the data.
window_sizes = [7, 14, 31, 90, 180, 365, 999]

#The aggregation is the same for every window, so define it once outside the loop:
#sum and mean for all the user logs stats, plus a single row count (taken on num_unq)
stat_aggs = {'num_unq':['sum', 'mean', 'count'],
             'total_secs':['sum', 'mean'],
             'num_25':['sum', 'mean'],
             'num_50':['sum', 'mean'],
             'num_75':['sum', 'mean'],
             'num_985':['sum', 'mean'],
             'num_100':['sum', 'mean'],
            }

window_frames = []
for num_days in window_sizes:
    #Keep only the rows inside the window, then aggregate them per user
    rows_in_window = user_logs.loc[user_logs['days_before_max_date'] < num_days]
    window_frames.append(rows_in_window.groupby(['msno'], sort=False).agg(stat_aggs))

#Put the windows side by side, each under its own 'within_days_<n>' top-level header
windowed_features = pd.concat(
    window_frames,
    axis=1,
    keys=['within_days_' + str(num_days) for num_days in window_sizes],
)

#Join (append) to user_logs_features table.
#Start from the 'date_features' columns only, so re-running this cell rebuilds the window
#columns instead of failing on columns left over from a previous run.
user_logs_features = user_logs_features[['date_features']].join(windowed_features, how='inner')

In [8]:
#Next, let's look at changes in last 7 days vs. last 30 days, and last 30 days vs. last 180 days.

#Also, need to think about users with < x days tenure.

In [81]:
#Save to CSV
#Now skipping this step in favor of writing the full features table to CSV below
#user_logs_features.to_csv('user_logs_features.csv')

In [24]:
#Merge members and labels files (both are indexed by msno)
members_labeled = members.join(labels, how='inner')

#user_logs_features has multi-level column headers while members/labels have single-level ones.
#Older pandas only warned about joining frames with different numbers of column levels and
#flattened the headers to tuples as a side effect; newer pandas rejects such a join.
#Flatten explicitly (to the same tuple labels) so this cell does not depend on that warning.
#A shallow copy is enough: only the column labels change, and user_logs_features stays untouched.
user_logs_features_flat = user_logs_features.copy(deep=False)
user_logs_features_flat.columns = user_logs_features.columns.to_flat_index()

features_all = members_labeled.join(user_logs_features_flat, how='inner')



In [None]:
#Convert days to integers
#user_logs['days_before_max_date'] = (user_logs['max_date'] - user_logs['date']).apply(lambda x: x.days)

In [26]:
#Persist the full feature table twice: as CSV and as a pickle
features_all.to_csv(path_or_buf='features_all.csv')
features_all.to_pickle(path='features_all.pkl')

High level plan.  Let's try several things:

* Initially get a table by user (1 row per user)
* User most recent date (max date)
* User first date (min date)
* How long they've been listening:  Min vs. max date by user
* Matrix of all the following (cartesian product)
    * Total X=(seconds, 100, 985, 75, 50, 25, unique), avg per day of X, maybe median per day of X
    * Last day, last 7 days, last 30 days, last 90, 180, 365, total (note last day is relative to user)
    * % change in periods above vs. outside that period.
    * ??? Total change in periods above vs. outside that period.
    * ??? Variation (do they listen consistently, or is it varied)?

# Other stuff below.  Delete for final product.

In [None]:
#Sample code, for reference:       
    #Gets # of column levels
        #past_xdays_by_user.columns.nlevels
    
    #This gets only items <= 7 days old
        #user_logs.loc[(user_logs['days_before_max_date'] <= 7)]

    #Get all records from specific user in past 45 days
        #user_logs.loc[(user_logs['msno'] == 'aof/9XT0zVdONwrq7vd+V4y3saluJQy+Wj0dlo9TKvI=') & (user_logs['days_before_max_date'] < 45)]

    #Just count songs
        #user_logs_gb.agg({'num_unq':{'count','sum'}}).head()
        
    #This code resolves columns with the same name, but shouldn't be used
        #user_logs_features = user_logs_features.join(past_xdays_by_user, lsuffix='_l', rsuffix='_r', how='inner')

    #This code doesn't error but the results aren't as expected 
        # user_logs_gb.apply(lambda x: pd.Series(dict(
        #     song_count=(x.days_before_max_date <= 7).count(),
        #     song_sum=(x.days_before_max_date <= 7).sum()
        # )))
        
#Ideas
    #Maybe get avg. days_before_max_date ... maybe something here.