In [1]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_parquet('/kaggle/input/train-parquet')

**Hover Duration Features Mean (session_id, event)** 

In [9]:
# Hover duration by session id and event
d1 = data[['session_id','hover_duration','event_name']]

In [13]:
d1 = d1[d1['hover_duration'].notnull()]

In [16]:
hover_mean = d1.groupby(by=['session_id','event_name'])['hover_duration'].mean()

In [45]:
map = hover_mean.xs('map_hover', level='event_name').reset_index()
object = hover_mean.xs('object_hover', level='event_name').reset_index()

In [46]:
map.rename(columns={'hover_duration': 'map_hover_duration'}, inplace=True)
object.rename(columns={'hover_duration': 'object_hover_duration'}, inplace=True)

In [47]:
d1 = pd.merge(map,object,on='session_id')

**Bingo Features**

In [61]:
d2 = data[['session_id','text_fqid','elapsed_time']]
d2 = d2[d2['text_fqid'].notnull()]

In [63]:
d2 = d2[d2['text_fqid'].str.endswith('bingo')]

In [108]:
first_bingo = d2.groupby('session_id').first().reset_index()

In [110]:
first_bingo_elapsed_time = first_bingo[['session_id','elapsed_time']]

In [117]:
first_bingo_elapsed_time.head()

Unnamed: 0,session_id,elapsed_time
0,20090312431273200,346295
1,20090312433251036,453175
2,20090313091715820,467478
3,20090314035813970,756783
4,20090314363702160,376433


In [114]:
d2['time_diff']=d2['elapsed_time'].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d2['time_diff']=d2['elapsed_time'].diff()


In [120]:
d2 = d2[d2['time_diff'] >= 0]

In [129]:
bingo_mean = d2.groupby(by='session_id')['time_diff'].mean().to_frame()

In [132]:
bingo_mean.rename(columns={'time_diff': 'bingo_time_mean'}, inplace=True)

Unnamed: 0_level_0,bingo_time_mean
session_id,Unnamed: 1_level_1
20090312431273200,83243.636364
20090312433251036,192211.470588
20090313091715820,81204.714286
20090314035813970,64043.291667
20090314363702160,347506.363636


In [135]:
d2 = pd.merge(bingo_mean,first_bingo_elapsed_time,on='session_id')

In [270]:
d2.rename(columns={'elapsed_time': 'first_bingo_elapsed_time'}, inplace=True)

In [271]:
d2.head()

Unnamed: 0,session_id,bingo_time_mean,first_bingo_elapsed_time
0,20090312431273200,83243.636364,346295
1,20090312433251036,192211.470588,453175
2,20090313091715820,81204.714286,467478
3,20090314035813970,64043.291667,756783
4,20090314363702160,347506.363636,376433


**Room Change Feature: Ratio of room change per session id**

In [334]:
d3 = data[['session_id','room_fqid']]

In [335]:
d3['shift'] = d3['room_fqid'].shift()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d3['shift'] = d3['room_fqid'].shift()


In [336]:
d3 = d3.groupby('session_id').apply(lambda x: x.iloc[1:]).reset_index(drop=True)

In [340]:
d3 = d3[d3['shift'].notnull()]

In [341]:
d3['room_change']= np.where(d3['room_fqid']==d3['shift'],0,1)

In [342]:
d3_count = d3.groupby(by = 'session_id')['room_change'].sum().to_frame()

In [343]:
d3_total = d3.groupby(by='session_id')['room_change'].count().to_frame()

In [344]:
d3 = pd.merge(d3_count,d3_total,on='session_id')

In [345]:
d3['room_change_ratio']=d3['room_change_x']/d3['room_change_y']

In [346]:
d3.reset_index(inplace = True)

In [347]:
d3 = d3[['session_id','room_change_ratio']]

In [348]:
d3.head()

Unnamed: 0,session_id,room_change_ratio
0,20090312431273200,0.071591
1,20090312433251036,0.062295
2,20090313091715820,0.0625
3,20090314035813970,0.062887
4,20090314363702160,0.076056


**text_fqid change feature: Ratio of text_fqid change per session id**

In [349]:
d4 = data[['session_id','text_fqid']]

In [350]:
d4 = d4[d4['text_fqid'].notnull()]

In [351]:
d4['shift'] = d4['text_fqid'].shift()

In [352]:
d4 = d4.groupby('session_id').apply(lambda x: x.iloc[1:]).reset_index(drop=True)

In [353]:
d4 = d4[d4['shift'].notnull()]

In [354]:
d4['text_change']= np.where(d4['text_fqid']==d4['shift'],0,1)

In [355]:
d4_count = d4.groupby(by = 'session_id')['text_change'].sum().to_frame()
d4_total = d4.groupby(by='session_id')['text_change'].count().to_frame()

In [356]:
d4 = pd.merge(d4_count,d4_total,on='session_id')

In [357]:
d4['text_change_ratio']=d4['text_change_x']/d4['text_change_y']

In [358]:
d4=d4['text_change_ratio'].reset_index()

In [359]:
d4.head()

Unnamed: 0,session_id,text_change_ratio
0,20090312431273200,0.203655
1,20090312433251036,0.207824
2,20090313091715820,0.195822
3,20090314035813970,0.177528
4,20090314363702160,0.153465


Combine All Features

In [360]:
from functools import reduce
dfs = [d2, d3, d4]
# Merging all DataFrames
features = reduce(lambda left, right: pd.merge(left, right, on='session_id', how='inner'), dfs)

In [361]:
features = pd.merge(features,d1,on='session_id',how='left')

In [363]:
features.head()

Unnamed: 0,session_id,bingo_time_mean,first_bingo_elapsed_time,room_change_ratio,text_change_ratio,map_hover_duration,object_hover_duration
0,20090312431273200,83243.636364,346295,0.071591,0.203655,246.259259,1732.763158
1,20090312433251036,192211.470588,453175,0.062295,0.207824,539.812785,1217.883495
2,20090313091715820,81204.714286,467478,0.0625,0.195822,432.461538,2103.177778
3,20090314035813970,64043.291667,756783,0.062887,0.177528,374.594595,4154.083333
4,20090314363702160,347506.363636,376433,0.076056,0.153465,344.84,2490.877551


In [364]:
features.to_csv('features_keyue.csv', index=False)