In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime, timedelta

### Cargo los datos

In [2]:
events = pd.read_csv("events_up_to_01062018.csv", low_memory=False, dtype={'sku' : 'object'})
training_labels = pd.read_csv("labels_training_set.csv", low_memory=False)
labels_predict = pd.read_csv("trocafone_kaggle_test.csv", low_memory=False)

### Categorizamos 

In [3]:
events['timestamp'] = pd.to_datetime(events['timestamp'], errors = 'coerce', format= '%Y-%m-%d')
events['event'] = pd.Categorical(events['event'])
events['url'] = pd.Categorical(events['url'])
events['model'] = pd.Categorical(events['model'])
events['condition'] = pd.Categorical(events['condition'])
events['storage'] = pd.Categorical(events['storage'])
events['color'] = pd.Categorical(events['color'])
events['staticpage'] = pd.Categorical(events['staticpage'])
events['campaign_source'] = pd.Categorical(events['campaign_source'])
events['search_engine'] = pd.Categorical(events['search_engine'])
events['channel'] = pd.Categorical(events['channel'])
events['new_vs_returning'] = pd.Categorical(events['new_vs_returning'])
events['city'] = pd.Categorical(events['city'])
events['region'] = pd.Categorical(events['region'])
events['country'] = pd.Categorical(events['country'])
events['device_type'] = pd.Categorical(events['device_type'])
events['screen_resolution'] = pd.Categorical(events['screen_resolution'])
events['operating_system_version'] = pd.Categorical(events['operating_system_version'])
events['browser_version'] = pd.Categorical(events['browser_version'])

### Agregamos features

In [4]:
events['day'] = events['timestamp'].dt.day
events['hora'] = events['timestamp'].dt.hour
events['sku'] = events['sku'].fillna(0)
events['sku'] = events['sku'].map(lambda x: float(x))

In [5]:
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,day,hora
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,18,0
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,18,0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,18,0
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,18,0
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,18,0


### Contamos los eventos para cada usuario

In [6]:
events_data = events.pivot_table(index='person', columns='event', values='timestamp', aggfunc='count', fill_value=0)
events_data.columns = events_data.columns.astype('object')
events_data.reset_index(inplace=True)
events_data.head()

event,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
0,0008ed71,0,0,3,0,1,0,0,0,0,0,2
1,00091926,15,25,2,0,0,0,0,0,0,372,34
2,00091a7a,1,5,0,0,0,0,0,0,0,3,1
3,000ba417,1,24,6,1,14,0,1,0,0,153,6
4,000c79fe,1,0,1,0,1,0,1,9,0,3,1


In [13]:
#Agrego feature de promedio de dias
promedios = events.groupby('person').agg({'day': 'mean'}).reset_index()
promedios.columns = ['person', 'promedio dias']
events_data = pd.merge(events_data, promedios, on= 'person', how = 'left')

In [14]:
#Agrego feature de condcion
condicion = events.groupby('person').agg({'condition': 'count'}).reset_index()
condicion.columns = ['person', 'condicion']
events_data = pd.merge(events_data, condicion, on= 'person', how = 'left')

In [12]:
#Agrego feature de momento del dia
promedios_hora = events.groupby('person').agg({'hora': 'mean'}).reset_index()
promedios_hora.columns = ['person', 'promedio hora']
events_data = pd.merge(events_data, promedios_hora, on= 'person', how = 'left')

In [17]:
#Agrego feature de sku
promedios_sku = pd.DataFrame(events.groupby('person').agg({'sku': 'mean'})).reset_index()
promedios_sku.columns = ['person', 'promedio sku']
promedios_sku['promedio sku'] = np.log(promedios_sku['promedio sku']) 
events_data = pd.merge(events_data ,promedios_sku, on= 'person', how = 'inner')

  after removing the cwd from sys.path.


In [16]:
#Agrego feature cantidad de retornos
events_retornos = events[events['new_vs_returning'] == 'Returning']
retornos = pd.DataFrame(events.groupby('person').agg({'new_vs_returning': 'count'})).reset_index()
retornos.columns = ['person', 'retornos']
events_data = pd.merge(events_data ,retornos, on= 'person', how = 'inner')

### Info de sesiones

In [21]:
# Tomo como sesión media hora
events = events.sort_values(by='timestamp')
events['timestamp_anterior'] = events.groupby('person')['timestamp'].shift()
events['start_session'] = ((events['timestamp'] - events['timestamp_anterior']) >= timedelta(minutes=30)) | events['timestamp_anterior'].isnull()
events['start_session'] = events['start_session'].astype(int)
events['session_num'] = events.groupby('person')['start_session'].cumsum()
events['session_id'] =  events['person'] + '-' + events['session_num'].astype(str)

In [22]:
session_df = events.groupby(['session_id','person']).agg({'event':'count', 'timestamp':['min','max']})
session_df.columns = session_df.columns.droplevel()
session_df.columns = ['number_events','timestamp_min','timestamp_max']
session_df['total_time'] = ((session_df['timestamp_max']) - session_df['timestamp_min']).astype('timedelta64[s]')
session_df = session_df.reset_index().sort_values(by='session_id')
session_df.head()

Unnamed: 0,session_id,person,number_events,timestamp_min,timestamp_max,total_time
0,0008ed71-1,0008ed71,1,2018-05-17 12:27:47,2018-05-17 12:27:47,0.0
1,0008ed71-2,0008ed71,2,2018-05-17 13:44:59,2018-05-17 13:45:00,1.0
2,0008ed71-3,0008ed71,3,2018-05-17 16:21:54,2018-05-17 16:28:37,403.0
3,00091926-1,00091926,7,2018-05-03 22:08:29,2018-05-03 22:09:59,90.0
4,00091926-10,00091926,4,2018-05-11 02:23:38,2018-05-11 02:23:57,19.0


In [23]:
sessions_by_person = session_df.groupby(['person']).agg({'session_id':'count', 'total_time':['sum','mean']})
sessions_by_person.columns = sessions_by_person.columns.droplevel()
sessions_by_person.columns = ['sessions', 'total_time', 'mean_time_by_session']
sessions_by_person = sessions_by_person.reset_index()
events_data = pd.merge(events_data, sessions_by_person, on='person', how='inner')

### Viewed products events

In [24]:
viewed_events = events[events['event'] == 'viewed product'][['person', 'sku', 'model', 'condition', 'storage', 'color', 'timestamp']]
viewed_events.head()

Unnamed: 0,person,sku,model,condition,storage,color,timestamp
1753200,0f4e2a4b,1397.0,iPhone 6 Plus,Excelente,64GB,Prateado,2018-01-01 08:09:44
1753203,0f4e2a4b,2843.0,iPhone 6 Plus,Bom,64GB,Dourado,2018-01-01 08:45:43
1753204,0f4e2a4b,2841.0,iPhone 6 Plus,Bom,64GB,Prateado,2018-01-01 08:48:57
1753213,0f4e2a4b,2843.0,iPhone 6 Plus,Bom,64GB,Dourado,2018-01-01 08:49:05
1753205,0f4e2a4b,2841.0,iPhone 6 Plus,Bom,64GB,Prateado,2018-01-01 08:49:08


In [25]:
## cuantos skus distintos vió
different_skus_viewed = viewed_events.groupby('person').agg({'sku': pd.Series.nunique}).reset_index()
different_skus_viewed.columns = ['person', 'different skus viewed']
events_data = pd.merge(events_data, different_skus_viewed, on='person', how='left')
events_data['different skus viewed'] = events_data['different skus viewed'].fillna(0)

In [26]:
## marcas que vió
viewed_events['marca'] =viewed_events['model'].str.split(' ').str[0]
viewed_data = viewed_events[['person', 'marca', 'timestamp']].pivot_table(index='person', columns='marca', values='timestamp', aggfunc='count', fill_value=0)              
viewed_data.columns = viewed_data.columns.astype('object')
viewed_data.reset_index(inplace=True)
events_data = pd.merge(events_data, viewed_data, on='person', how='left', suffixes=('','-viewed')).fillna(0)
viewed_data.head()

marca,person,Asus,LG,Lenovo,Motorola,Quantum,Samsung,Sony,iPad,iPhone
0,00091926,0,2,1,55,0,61,1,1,251
1,00091a7a,0,0,0,0,0,0,0,0,3
2,000ba417,0,4,0,35,0,105,1,0,8
3,000c79fe,0,0,0,0,0,0,0,0,3
4,000e4d9e,0,1,0,2,0,300,15,0,21


### Checkout events

In [27]:
checkout_events = events[events['event'] == 'checkout'][['timestamp', 'event', 'person', 'sku', 'model', 'condition',
       'storage', 'color']]
checkout_events.head()

Unnamed: 0,timestamp,event,person,sku,model,condition,storage,color
1753219,2018-01-01 22:35:36,checkout,a66e8424,2710.0,iPhone 5,Bom,16GB,Branco
1753283,2018-01-01 22:35:51,checkout,a66e8424,2710.0,iPhone 5,Bom,16GB,Branco
1753286,2018-01-01 22:42:18,checkout,a66e8424,3647.0,iPhone 5c,Bom,8GB,Branco
1060096,2018-01-01 23:05:55,checkout,a66e8424,2712.0,iPhone 4G,Bom,8GB,Branco
361033,2018-01-02 02:33:13,checkout,25950776,2667.0,Samsung Galaxy Note 4,Muito Bom,32GB,Branco


In [28]:
## cuantos skus distintos checkouteó
different_skus_checkout = checkout_events.groupby('person').agg({'sku': pd.Series.nunique}).reset_index()
different_skus_checkout.columns = ['person', 'different skus checkout']
events_data = pd.merge(events_data, different_skus_checkout, on='person', how='left')
events_data['different skus checkout'] = events_data['different skus checkout'].fillna(0)

In [29]:
## marcas que checkouteó
checkout_events['marca'] =checkout_events['model'].str.split(' ').str[0]
checkout_data = checkout_events[['person', 'marca', 'timestamp']].pivot_table(index='person', columns='marca', values='timestamp', aggfunc='count', fill_value=0)              
checkout_data.columns = checkout_data.columns.astype('object')
checkout_data.reset_index(inplace=True)
events_data = pd.merge(events_data, checkout_data, on='person', how='left', suffixes=('','-checkout')).fillna(0)
checkout_data.head()

marca,person,Asus,LG,Lenovo,Motorola,Quantum,Samsung,Sony,iPad,iPhone
0,0008ed71,0,1,0,0,0,1,0,0,1
1,00091926,0,0,0,0,0,0,0,0,2
2,000ba417,0,0,0,1,0,5,0,0,0
3,000c79fe,0,0,0,0,0,0,0,0,1
4,000e4d9e,0,0,0,0,0,1,0,0,0


### Dispositivos

In [30]:
devices = events[events['device_type'].notnull()][['person', 'device_type', 'timestamp']].pivot_table(index='person', columns='device_type', values='timestamp', aggfunc='count', fill_value=0)              
devices.columns = devices.columns.astype('object')
devices.reset_index(inplace=True)
events_data = pd.merge(events_data, devices, on='person', how='left', suffixes=('','-devices')).fillna(0)
devices.head()

device_type,person,Computer,Smartphone,Tablet,Unknown
0,0008ed71,2,0,0,0
1,00091926,34,0,0,0
2,00091a7a,0,1,0,0
3,000ba417,6,0,0,0
4,000c79fe,0,1,0,0


### Guardamos 

In [18]:
events_data.head()

Unnamed: 0,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,...,condicion_x,promedio hora_x,promedio sku_x,retornos_x,promedio hora_y,promedio dias_y,condicion_y,promedio sku_y,retornos_y,promedio sku
0,0008ed71,0,0,3,0,1,0,0,0,0,...,3,14.333333,8.06694,2,14.333333,17.0,3,8.06694,2,8.06694
1,00091926,15,25,2,0,0,0,0,0,0,...,374,7.006696,8.699917,34,7.006696,16.732143,374,8.699917,34,8.699917
2,00091a7a,1,5,0,0,0,0,0,0,0,...,3,14.0,7.406286,1,14.0,26.0,3,7.406286,1,7.406286
3,000ba417,1,24,6,1,14,0,1,0,0,...,160,13.524272,8.536027,6,13.524272,22.262136,160,8.536027,6,8.536027
4,000c79fe,1,0,1,0,1,0,1,9,0,...,4,0.0,7.757806,1,0.0,29.0,4,7.757806,1,7.757806


In [13]:
events_data.to_csv('features.csv', index = False)