In [1]:
import time
import datetime
import gc
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
def add_count(df, cols, cname):
    """Attach a per-group row count to ``df``.

    Rows are grouped by ``cols`` and the non-null ``instance_id`` values of
    each group are counted; the counts are left-joined back onto ``df``.

    Args:
        df: DataFrame holding ``cols`` and an ``instance_id`` column.
        cols: List of column names that define a group.
        cname: Name of the count column to add.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column.
    """
    group_sizes = df.groupby(cols)['instance_id'].count().to_frame().reset_index()
    group_sizes.columns = cols + [cname]
    enriched = df.merge(group_sizes, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_sizes
    gc.collect()
    return enriched

def add_cumcount(df, cols, cname):
    """Add a 1-based running position of each row within its ``cols`` group.

    The column is written onto ``df`` itself (the input is mutated) and the
    same object is returned. Positions follow the current row order of ``df``.

    Args:
        df: DataFrame containing ``cols``.
        cols: List of column names that define a group.
        cname: Name of the cumulative-count column to add.

    Returns:
        ``df`` with the ``cname`` column set.
    """
    zero_based_position = df.groupby(cols).cumcount()
    df[cname] = zero_based_position + 1
    return df

def add_nunique(df, cols, cname, value):
    """Attach the number of distinct ``value`` entries per ``cols`` group.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the distinct-count column to add.
        value: Column whose distinct values are counted.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    distinct_counts = df.groupby(cols)[value].nunique().to_frame().reset_index()
    distinct_counts.columns = cols + [cname]
    enriched = df.merge(distinct_counts, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del distinct_counts
    gc.collect()
    return enriched

# -----------------------------------------------------------------------------------

# Group-wise statistic helpers: mean, median, max, min, std
def add_mean(df, cols, cname, value):
    """Attach the mean of ``value`` within each ``cols`` group.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the mean column to add.
        value: Column to average.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    group_means = df.groupby(cols)[value].mean().to_frame().reset_index()
    group_means.columns = cols + [cname]
    enriched = df.merge(group_means, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_means
    gc.collect()
    return enriched

def add_median(df, cols, cname, value):
    """Attach the median of ``value`` within each ``cols`` group.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the median column to add.
        value: Column whose median is taken.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    group_medians = df.groupby(cols)[value].median().to_frame().reset_index()
    group_medians.columns = cols + [cname]
    enriched = df.merge(group_medians, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_medians
    gc.collect()
    return enriched

def add_max(df, cols, cname, value):
    """Attach the maximum of ``value`` within each ``cols`` group.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the maximum column to add.
        value: Column whose maximum is taken.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    group_maxima = df.groupby(cols)[value].max().to_frame().reset_index()
    group_maxima.columns = cols + [cname]
    enriched = df.merge(group_maxima, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_maxima
    gc.collect()
    return enriched

def add_min(df, cols, cname, value):
    """Attach the minimum of ``value`` within each ``cols`` group.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the minimum column to add.
        value: Column whose minimum is taken.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    group_minima = df.groupby(cols)[value].min().to_frame().reset_index()
    group_minima.columns = cols + [cname]
    enriched = df.merge(group_minima, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_minima
    gc.collect()
    return enriched

def add_std(df, cols, cname, value):
    """Attach the standard deviation of ``value`` within each ``cols`` group.

    Uses the pandas groupby default for ``std`` (sample standard deviation),
    so groups with a single row get NaN.

    Args:
        df: DataFrame containing ``cols`` and ``value``.
        cols: List of column names that define a group.
        cname: Name of the standard-deviation column to add.
        value: Column whose spread is measured.

    Returns:
        A new DataFrame: ``df`` plus the ``cname`` column (left join on ``cols``).
    """
    group_spread = df.groupby(cols)[value].std().to_frame().reset_index()
    group_spread.columns = cols + [cname]
    enriched = df.merge(group_spread, on=cols, how='left')
    # Free the temporary lookup table before handing the result back.
    del group_spread
    gc.collect()
    return enriched

# -------------------------------------------------------------------------------------

# TODO: consider adding conversion-rate features here.

# -------------------------------------------------------------------------------------
# Find the time of day at which each user is most active.
# active_time encoding: 0 = is_midnight, 1 = is_morning, 2 = is_afternoon, 3 = is_night
def add_active_time(df):
    """Replace the four time-of-day flag columns with a per-user ``active_time``.

    For every ``user_id`` the ``is_midnight``, ``is_morning``, ``is_afternoon``
    and ``is_night`` columns are summed, and the flag with the largest total is
    recorded as an integer code (0, 1, 2, 3 in that order). When totals tie,
    ``idxmax`` picks the first maximal column, i.e. the lowest code.

    Args:
        df: DataFrame with ``user_id`` and the four flag columns.

    Returns:
        A new DataFrame with ``active_time`` added and the four flag columns
        removed.
    """
    period_flags = ['is_midnight', 'is_morning', 'is_afternoon', 'is_night']

    # Per-user total of each flag, joined side by side on user_id.
    totals = None
    for flag in period_flags:
        flag_total = df.groupby('user_id')[flag].sum().to_frame().reset_index()
        if totals is None:
            totals = flag_total
        else:
            totals = totals.merge(flag_total, on='user_id', how='left')

    # Relabel each flag column with its integer code so idxmax returns the code.
    codes = list(range(len(period_flags)))
    totals = totals.rename(columns=dict(zip(period_flags, codes)))
    totals['active_time'] = totals[codes].idxmax(axis=1)

    df = df.merge(totals, on='user_id', how='left')
    # Drop the temporary per-code totals, then the raw flags themselves.
    df = df.drop(codes, axis=1)
    df = df.drop(period_flags, axis=1)
    return df


In [3]:
# Load the prepared training and test splits.
train, test = (pd.read_csv(path) for path in ('data/complete_train.csv', 'data/test.csv'))
# Report which days each split covers, training split first.
print(*(Counter(split['day']).keys() for split in (train, test)), sep='\n')

dict_keys([17, 18, 20, 21, 19, 22, 23, 24])
dict_keys([25, 24])


In [4]:
# Order the training rows chronologically (day, then time) and renumber them
# 0..n-1, discarding the pre-sort index.
train = train.sort_values(by=['day', 'time']).reset_index(drop=True)

In [5]:
# Count features: new column name -> list of columns to group by.
count_features = dict(
    # grouped on user_id
    user_count=['user_id'],
    user_shop_count=['user_id', 'shop_id'],
    user_item_count=['user_id', 'item_id'],
    user_shop_item_count=['user_id', 'shop_id', 'item_id'],
    # user_id crossed with item / context attributes
    user_itembrand_count=['user_id', 'item_brand_id'],
    user_itemcity_count=['user_id', 'item_city_id'],
    user_page_count=['user_id', 'context_page_id'],
    # grouped on item_id
    item_useroccupation_count=['item_id', 'user_occupation_id'],
    item_page_count=['item_id', 'context_page_id'],
)

# Cumulative-count features: new column name -> list of columns to group by.
# All of them are keyed on user_id.
cumcount_features = dict([
    ('user_cumcount', ['user_id']),
    ('user_shop_cumcount', ['user_id', 'shop_id']),
    ('user_item_cumcount', ['user_id', 'item_id']),
    ('user_shop_item_cumcount', ['user_id', 'shop_id', 'item_id']),
])

# Distinct-count features: new column name -> [group-by columns, counted column].
unicount_features = {
    name: [[group_key], counted_col]
    for name, group_key, counted_col in (
        # grouped on user_id
        ('user_item_unicount', 'user_id', 'item_id'),
        ('user_shop_unicount', 'user_id', 'shop_id'),
        # grouped on shop_id
        ('shop_item_unicount', 'shop_id', 'item_id'),
    )
}

# Statistical features: column-name prefix -> [group-by columns, aggregated column].
# feature_selection appends '_mean' and '_std' to each prefix.
stat_features = {
    prefix: [[group_key], value_col]
    for prefix, group_key, value_col in (
        # grouped on user_id: item attributes
        ('user_item_price', 'user_id', 'item_price_level'),
        ('user_item_sales', 'user_id', 'item_sales_level'),
        ('user_item_pv', 'user_id', 'item_pv_level'),
        # grouped on user_id: shop attributes
        ('user_shop_review_num', 'user_id', 'shop_review_num_level'),
        ('user_shop_star', 'user_id', 'shop_star_level'),
        ('user_shop_service', 'user_id', 'shop_score_service'),
        ('user_shop_delivery', 'user_id', 'shop_score_delivery'),
        ('user_shop_description', 'user_id', 'shop_score_description'),
        # grouped on item_id
        ('item_user_age', 'item_id', 'user_age_level'),
        ('item_user_star', 'item_id', 'user_star_level'),
        # grouped on shop_id
        ('shop_item_price', 'shop_id', 'item_price_level'),
        ('shop_user_age', 'shop_id', 'user_age_level'),
        ('shop_user_star', 'shop_id', 'user_star_level'),
    )
}

# Ratio features: new column name -> [group-by columns, source column].
# The "family" proportion is below 0.05 everywhere (and mostly 0), so it is
# left out.
# NOTE(review): feature_selection does not appear to read this dict; it builds
# the same two gender-ratio columns by hand. Confirm before relying on it.
ratio_features = dict(
    shop_user_gender_ratio=[['shop_id'], 'user_gender_id'],
    item_user_gender_ratio=[['item_id'], 'user_gender_id'],
)


In [6]:
def _add_zero_share(df, key, value, cname):
    """Left-join onto ``df`` the per-``key`` fraction of rows where ``value`` equals 0."""
    share = df[value].eq(0).groupby(df[key]).mean().rename(cname).reset_index()
    return df.merge(share, on=key, how='left')


def feature_selection(df):
    """Build every engineered feature for one data split.

    Adds, in order: the per-user ``active_time`` code, the count,
    cumulative-count and distinct-count columns described by the module-level
    ``count_features`` / ``cumcount_features`` / ``unicount_features`` dicts,
    a ``_mean`` and ``_std`` column for each ``stat_features`` entry, the
    share of ``user_gender_id == 0`` per shop and per item, and the
    ``item_collected_level / item_pv_level`` ratio. Finally drops
    ``item_collected_level`` and ``item_property_list``.

    All statistics are computed from ``df`` itself, so the function no longer
    depends on whatever a global ``train`` happens to hold when it is called.

    Args:
        df: Raw split. Must contain the time-of-day flag columns,
            ``instance_id`` and every column named in the feature dicts.

    Returns:
        A DataFrame with the engineered columns added and the raw helper
        columns removed. Cumulative counts follow the row order of ``df``.
    """
    # add active_time
    df = add_active_time(df)
    # add count features
    for cname, cols in count_features.items():
        df = add_count(df, cols=cols, cname=cname)
    # add cumulative count features
    for cname, cols in cumcount_features.items():
        df = add_cumcount(df, cols=cols, cname=cname)
    # add unique count features
    for cname, (cols, value) in unicount_features.items():
        df = add_nunique(df, cols=cols, cname=cname, value=value)
    # statistical features (median / max / min helpers exist but are not used)
    for prefix, (cols, value) in stat_features.items():
        df = add_mean(df, cols=cols, cname=prefix + '_mean', value=value)
        df = add_std(df, cols=cols, cname=prefix + '_std', value=value)

    # ratio features: share of rows with user_gender_id == 0, per shop and per
    # item. Both are taken from df; the item ratio previously read the global
    # `train`, which made the result for any other split depend on kernel state.
    df = _add_zero_share(df, 'shop_id', 'user_gender_id', 'shop_user_gender_ratio')
    df = _add_zero_share(df, 'item_id', 'user_gender_id', 'item_user_gender_ratio')

    # NOTE(review): a zero item_pv_level gives inf/NaN here; confirm that the
    # downstream model handles it.
    df['item_collect_pv_ratio'] = df.item_collected_level / df.item_pv_level
    df = df.drop(columns=['item_collected_level', 'item_property_list'])
    return df
    

In [7]:
# Build the engineered features for the training split.
# Not idempotent: the time-of-day flag columns are consumed by this call, so
# re-running the cell on the already-processed frame fails. Reload the raw
# data first.
train = feature_selection(train)

In [8]:
# Persist the engineered training features without the row index.
train.to_csv('data/processed_train.csv', index=False)

In [9]:
# Run the same feature pipeline on the test split and persist it without the
# row index.
test = feature_selection(test)
test.to_csv('data/processed_test.csv', index=False)

In [11]:
# Display the column labels of the processed training frame.
train.columns

Index(['instance_id', 'item_id', 'item_brand_id', 'item_city_id',
       'item_price_level', 'item_sales_level', 'item_pv_level', 'user_id',
       'user_gender_id', 'user_age_level', 'user_occupation_id',
       'user_star_level', 'context_id', 'context_timestamp', 'context_page_id',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade', 'datetime', 'day', 'hour', 'time',
       'item_category', 'cate_precision', 'cate_recall', 'prop_precision',
       'prop_recall', 'active_time', 'user_count', 'user_shop_count',
       'user_item_count', 'user_shop_item_count', 'user_itembrand_count',
       'user_itemcity_count', 'user_page_count', 'item_useroccupation_count',
       'item_page_count', 'user_cumcount', 'user_shop_cumcount',
       'user_item_cumcount', 'user_shop_item_cumcount', 'user_item_unicount',
       'user_shop_unicount', 'shop_item_unicount', 