In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [51]:
# Загружаем преобработанные файлы для train

df_orders_train = pd.read_parquet('data\\train\\df_orders_flat.parquet')
df_visits_train = pd.read_parquet('data\\train\\df_visits_flat.parquet')
df_last_visits_cat_train = pd.read_parquet('data\\train\\df_last_visits_cat_flat.parquet')

In [52]:
# Загружаем преобработанные файлы для val

df_orders_val = pd.read_parquet('data\\val\\df_orders_flat.parquet')
df_visits_val = pd.read_parquet('data\\val\\df_visits_flat.parquet')
df_last_visits_cat_val = pd.read_parquet('data\\val\\df_last_visits_cat_flat.parquet')

In [66]:
# Функции для извлечения фичей из предобработанных файлов

def get_features_from_orders(x):
    features = {}
    features['length_mean'] = x['length'].mean()
    features['order_site-id_count'] = x['site-id'].count()
    features['order_site-id_unique'] = x['site-id'].nunique()
    #features['site-id_mode'] = x['site-id'].mode().tolist()#.to_string(index=False)
    features['id_count'] = x['id'].count()
    features['id_count_unique'] = x['id'].nunique()
    #features['id_mode'] = x['id'].mode().tolist()#.to_string(index=False)
    features['count_sum'] = x['count'].sum()
    #features['brand-id_mode'] = x['brand-id'].mode().tolist()#.to_string(index=False) 
    #features['brand-id_unique'] = x['brand-id'].nunique()
    return pd.Series(features, index=list(features.keys()))

def get_features_from_visits(x):
    features = {}
    features['session-duration_mean'] = x['session-duration'].mean()
    features['pages-count_mean'] = x['pages-count'].mean()
    features['visit_site-id_count'] = x['site-id'].count()
    features['visit_site-id_unique'] = x['site-id'].nunique()
    return pd.Series(features, index=list(features.keys()))

def extract_features(orders_df, visits_df):
    res_df = orders_df.groupby(['user_id', 'target']).apply(get_features_from_orders)
    res_df = res_df.reset_index()
    res_df = res_df.sort_values(by=['user_id'], key=lambda s: s.str[5:].astype(int))
    res_df = res_df.reset_index().drop(['index'], axis=1)
    res_df = res_df.replace('Series([], )', np.nan)
    
    visits_features = visits_df.groupby(['user_id']).apply(get_features_from_visits)
    visits_features = visits_features.reset_index()
    res_df = res_df.merge(visits_features, left_on='user_id', right_on='user_id')
    res_df = res_df.fillna(0)
    
    return res_df

In [67]:
train_df = extract_features(df_orders_train, df_visits_train)
train_df

Unnamed: 0,user_id,target,length_mean,order_site-id_count,order_site-id_unique,id_count,id_count_unique,count_sum,session-duration_mean,pages-count_mean,visit_site-id_count,visit_site-id_unique
0,user_1,female,3.0,3.0,2.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
1,user_2,female,11.0,0.0,0.0,0.0,0.0,0.0,56.000000,1.500000,2.0,1.0
2,user_3,male,16.0,2.0,2.0,1.0,1.0,1.0,195.928571,4.571429,14.0,3.0
3,user_4,male,27.0,9.0,4.0,4.0,4.0,4.0,267.764706,3.176471,17.0,6.0
4,user_5,female,35.0,6.0,3.0,5.0,5.0,5.0,279.391304,5.391304,23.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
119546,user_127751,male,11.0,5.0,2.0,3.0,3.0,3.0,2.500000,1.500000,2.0,2.0
119547,user_127752,male,13.0,3.0,2.0,1.0,1.0,1.0,0.000000,1.000000,1.0,1.0
119548,user_127753,female,24.0,17.0,1.0,17.0,12.0,20.0,106.600000,2.428571,35.0,8.0
119549,user_127754,female,8.0,1.0,1.0,1.0,1.0,0.0,101.000000,2.250000,4.0,2.0


In [68]:
train_df.to_parquet('data\\train_df.parquet')

In [69]:
val_df = extract_features(df_orders_val, df_visits_val)
val_df

Unnamed: 0,user_id,target,length_mean,order_site-id_count,order_site-id_unique,id_count,id_count_unique,count_sum,session-duration_mean,pages-count_mean,visit_site-id_count,visit_site-id_unique
0,user_127756,female,8.0,1.0,1.0,1.0,1.0,0.0,95.833333,1.583333,12.0,2.0
1,user_127757,male,16.0,16.0,4.0,14.0,14.0,16.0,621.000000,4.000000,3.0,3.0
2,user_127758,female,50.0,38.0,5.0,38.0,35.0,67.0,179.884615,4.365385,52.0,15.0
3,user_127759,female,34.0,52.0,3.0,52.0,46.0,52.0,105.314286,2.657143,35.0,9.0
4,user_127760,female,12.0,0.0,0.0,0.0,0.0,0.0,296.000000,18.333333,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
27442,user_155198,female,22.0,9.0,6.0,7.0,6.0,9.0,0.000000,1.000000,2.0,1.0
27443,user_155199,male,7.0,3.0,1.0,0.0,0.0,0.0,15.333333,1.333333,3.0,2.0
27444,user_155200,female,23.0,2.0,1.0,2.0,2.0,2.0,89.307692,1.846154,13.0,5.0
27445,user_155201,male,27.0,3.0,2.0,0.0,0.0,0.0,99.818182,2.272727,22.0,11.0


In [70]:
val_df.to_parquet('data\\val_df.parquet')

In [81]:
val_orders_grouped = val_orders_flat.groupby(['user_id', 'target']).apply(get_features_from_orders)
val_orders_grouped = val_orders_grouped.reset_index()
val_orders_grouped = val_orders_grouped.replace('Series([], )', np.nan)
val_orders_grouped

Unnamed: 0,user_id,target,site-id_count,site-id_unique,site-id_mode,id_count,id_count_unique,id_mode,count_sum,brand-id_mode,brand-id_unique
0,user_127756,female,1,1,[407.0],1,1,[item_676374],0.0,[],0
1,user_127757,male,16,4,[16.0],14,14,"[item_1399130, item_1627033, item_1898950, ite...",16.0,"[1055.0, 1237.0]",6
2,user_127758,female,38,5,[21.0],38,35,"[item_2072631, item_3362928, item_611359]",67.0,[15820.0],21
3,user_127759,female,52,3,[93.0],52,46,"[item_47281, item_47282]",52.0,[3192.0],3
4,user_127760,female,0,0,[],0,0,[],0.0,[],0
...,...,...,...,...,...,...,...,...,...,...,...
27442,user_155198,female,9,6,"[7.0, 8.0, 49.0]",7,6,[item_51490],9.0,[1889758.0],3
27443,user_155199,male,3,1,[21.0],0,0,[],0.0,[],0
27444,user_155200,female,2,1,[34.0],2,2,"[item_1648, item_6173373]",2.0,[708.0],1
27445,user_155201,male,3,2,[218.0],0,0,[],0.0,[],0
