In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime, timedelta
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

### Cargo los datos

In [2]:
# Raw event log; `sku` is read as text (object dtype) and converted later on.
events = pd.read_csv(
    "events_up_to_01062018.csv",
    dtype={'sku' : 'object'},
    low_memory=False,
)
# Label file for the training set.
training_labels = pd.read_csv(
    "labels_training_set.csv",
    low_memory=False,
)
# Kaggle test file with the rows to predict.
labels_predict = pd.read_csv(
    "trocafone_kaggle_test.csv",
    low_memory=False,
)

### Categorizamos 

In [3]:
# Parse the event timestamps. The values carry a time of day (the hour-based
# features computed later are non-zero), so the format has to include it:
# with a date-only format and errors='coerce', recent pandas versions reject
# the trailing time and silently turn every timestamp into NaT.
events['timestamp'] = pd.to_datetime(
    events['timestamp'], errors='coerce', format='%Y-%m-%d %H:%M:%S'
)

# Columns stored as pandas categoricals.
CATEGORICAL_COLUMNS = [
    'event',
    'url',
    'model',
    'condition',
    'storage',
    'color',
    'staticpage',
    'campaign_source',
    'search_engine',
    'channel',
    'new_vs_returning',
    'city',
    'region',
    'country',
    'device_type',
    'screen_resolution',
    'operating_system_version',
    'browser_version',
]
for column in CATEGORICAL_COLUMNS:
    events[column] = pd.Categorical(events[column])

### Agregamos features

In [4]:
# Day of month and hour of day taken from the parsed timestamp.
timestamp_parts = events['timestamp'].dt
events['day'] = timestamp_parts.day
events['hora'] = timestamp_parts.hour
# Missing SKUs become 0, then the column (read as text) is converted to float.
events['sku'] = events['sku'].fillna(0).astype(float)

In [5]:
# Show the first rows of `events` to check the columns added above.
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,day,hora
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,18,0
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,18,0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,18,0
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,18,0
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,18,0


### Contamos los eventos para cada usuario

In [6]:
# One row per user, one column per event type; each cell counts that user's
# events of that type, with 0 where the user has none.
events_data = pd.pivot_table(
    events,
    values='timestamp',
    index='person',
    columns='event',
    aggfunc='count',
    fill_value=0,
)
# `event` is categorical, so the column index is cast to plain objects
# (presumably so that reset_index can insert the `person` column).
events_data.columns = events_data.columns.astype('object')
events_data = events_data.reset_index()
events_data.head()

event,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
0,0008ed71,0,0,3,0,1,0,0,0,0,0,2
1,00091926,15,25,2,0,0,0,0,0,0,372,34
2,00091a7a,1,5,0,0,0,0,0,0,0,3,1
3,000ba417,1,24,6,1,14,0,1,0,0,153,6
4,000c79fe,1,0,1,0,1,0,1,9,0,3,1


In [7]:
# Feature: mean of the `day` column (day of month) over each user's events.
# Passing a {name: func} dict to SeriesGroupBy.agg was deprecated in pandas 0.20
# and raises SpecificationError since pandas 1.0, so aggregate first and then
# name the resulting column.
promedios = (
    events.groupby('person')['day']
    .mean()
    .rename('promedio dias')
    .reset_index()
)
events_data = pd.merge(events_data, promedios, on='person', how='inner')

In [8]:
# Feature: number of each user's events that have a non-null `condition`.
# Passing a {name: func} dict to SeriesGroupBy.agg was deprecated in pandas 0.20
# and raises SpecificationError since pandas 1.0, so aggregate first and then
# name the resulting column.
condicion = (
    events.groupby('person')['condition']
    .count()
    .rename('condicion')
    .reset_index()
)
events_data = pd.merge(events_data, condicion, on='person', how='inner')

In [9]:
# Feature: time of day, as the mean of the `hora` column (hour) over each
# user's events.
# Passing a {name: func} dict to SeriesGroupBy.agg was deprecated in pandas 0.20
# and raises SpecificationError since pandas 1.0, so aggregate first and then
# name the resulting column.
promedios_hora = (
    events.groupby('person')['hora']
    .mean()
    .rename('promedio hora')
    .reset_index()
)
events_data = pd.merge(events_data, promedios_hora, on='person', how='inner')

In [10]:
# Feature: natural log of the mean `sku` value over each user's events.
# Missing SKUs were filled with 0 earlier in the notebook, so they count as
# zeros in this mean.
# Passing a {name: func} dict to SeriesGroupBy.agg was deprecated in pandas 0.20
# and raises SpecificationError since pandas 1.0, so aggregate first and then
# name the resulting column.
promedios_sku = (
    events.groupby('person')['sku']
    .mean()
    .rename('promedio sku')
    .reset_index()
)
# A user whose mean SKU is 0 would get log(0) = -inf, which fillna(0) does not
# remove from the feature table. Mask non-positive means before taking the log
# and store 0 for them, the same placeholder used for other missing features.
sku_medio = promedios_sku['promedio sku']
promedios_sku['promedio sku'] = np.log(sku_medio.where(sku_medio > 0)).fillna(0)
events_data = pd.merge(events_data, promedios_sku, on='person', how='inner')

In [11]:
# Show the first rows of the per-user feature table built so far.
events_data.head()

Unnamed: 0,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,promedio dias,condicion,promedio hora,promedio sku
0,0008ed71,0,0,3,0,1,0,0,0,0,0,2,17.0,3,14.333333,8.06694
1,00091926,15,25,2,0,0,0,0,0,0,372,34,16.732143,374,7.006696,8.699917
2,00091a7a,1,5,0,0,0,0,0,0,0,3,1,26.0,3,14.0,7.406286
3,000ba417,1,24,6,1,14,0,1,0,0,153,6,22.262136,160,13.524272,8.536027
4,000c79fe,1,0,1,0,1,0,1,9,0,3,1,29.0,4,0.0,7.757806


### Feature de TFIdf

In [12]:
# Start by building a TF-IDF model of the search terms.
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer created with its default settings; it is fitted on the search
# terms in the following cell.
vectorizer = TfidfVectorizer()

In [13]:
# Keep only the events that carry a search term, with the user who made them.
# .copy() makes this an independent frame, so adding a column to it below does
# not trigger SettingWithCopyWarning.
busquedas = events.loc[events['search_term'].notnull(), ['search_term', 'person']].copy()

# Fit TF-IDF on every search term; row i of the sparse result belongs to the
# i-th search in `busquedas`.
busquedas_totales = busquedas['search_term'].tolist()
IDF = vectorizer.fit_transform(busquedas_totales)

# Score of a search = sum of the TF-IDF weights in its row. Summing along
# axis=1 handles all rows at once instead of slicing the sparse matrix row by
# row in a Python loop; asarray/ravel flattens the result to one value per search.
resultados = np.asarray(IDF.sum(axis=1)).ravel()

busquedas['TfIdf'] = resultados
busquedas = busquedas[['person', 'TfIdf']]

# Group by user and average the scores. Passing a {name: func} dict to
# SeriesGroupBy.agg raises SpecificationError since pandas 1.0, so aggregate
# first and then name the resulting column.
busquedas_por_usuario = (
    busquedas.groupby('person')['TfIdf']
    .mean()
    .rename('promedio de TFIdf')
    .reset_index()
)

In [14]:
# Show the first rows of the per-user mean TF-IDF table.
busquedas_por_usuario.head()

Unnamed: 0,person,promedio de TFIdf
0,000c79fe,1.168325
1,000e619d,1.387051
2,001001be,1.0
3,001802e4,1.277478
4,0019e639,1.163643


In [15]:
# Left-join the per-user TF-IDF averages so users without searches are kept.
events_data = events_data.merge(busquedas_por_usuario, how='left', on='person')
# Every remaining NaN in the table (not only the new column) becomes 0.
events_data = events_data.fillna(0)

### Feature relacionado al canal por el que volvieron a la página

In [16]:
# Count, per user, the events recorded for each channel (original note: how
# many times the user came back and through which channel).
channel_data = pd.pivot_table(
    events,
    values='timestamp',
    index='person',
    columns='channel',
    aggfunc='count',
    fill_value=0,
)
# `channel` is categorical, so the column index is cast to plain objects
# (presumably so that reset_index can insert the `person` column).
channel_data.columns = channel_data.columns.astype('object')
channel_data = channel_data.reset_index()
# Disabled alternative that turns the counts into 0/1 flags:
# channel_data = channel_data.applymap(lambda x: (1 if x > 0 else 0) if type(x) == int else x)

In [17]:
# Left-join the per-channel counts onto the feature table, keeping every user.
events_data = events_data.merge(channel_data, how='left', on='person')
# Every remaining NaN in the table becomes 0.
events_data = events_data.fillna(0)

In [18]:
# Not sure this feature helps: it did not improve the score, which stayed at 0.75
# Found: 0    12633
#        1     6782

### Exclusividad del producto comprado

In [19]:
# Distinct (user, sku) pairs taken from the conversion events.
# Selecting rows and columns in a single .loc call and using the non-inplace
# drop_duplicates avoids mutating a chained slice in place, which triggers
# SettingWithCopyWarning.
conversion_data = (
    events.loc[events['event'] == 'conversion', ['person', 'sku']]
    .drop_duplicates()
)
conversion_data.head()

Unnamed: 0,person,sku
195,49c19e32,2683.0
724,39df97e0,9358.0
744,380c0e60,6314.0
1198,35ee0cc8,10924.0
2124,c5a0bc36,10855.0


In [20]:
# Share of the distinct (user, sku) conversion pairs that each SKU accounts for.
# The column names produced by value_counts().reset_index() differ between
# pandas versions, so renaming 'index'/'sku' afterwards can end up relabelling
# and normalising the SKU ids instead of the counts. Naming the index and the
# value column explicitly, and letting value_counts normalise, gives the same
# table on every version.
frecuencia_de_skus = (
    conversion_data['sku']
    .value_counts(normalize=True)
    .rename_axis('sku')
    .reset_index(name='frecuencia_sku')
)
frecuencia_de_skus.head()

Unnamed: 0,sku,frecuencia_sku
0,6371.0,0.021772
1,6357.0,0.010698
2,290.0,0.009572
3,3371.0,0.008634
4,6370.0,0.008258


In [21]:
# conversion_data['frecuencia_sku'] = frecuencia_de_skus[frecuencia_de_skus['sku'] == conversion_data['sku']]['frecuencia_sku']
# conversion_data = conversion_data[['person','frecuencia_sku']]
# conversion_data.head()

In [22]:
# events_data = pd.merge(events_data, conversion_data, on ='person', how = 'left')
# events_data['frecuencia_sku'].fillna(0,inplace=True)

### Guardamos 

In [23]:
# Write the per-user feature table to features.csv, leaving out the row index.
events_data.to_csv('features.csv', index = False)