In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Location of the raw events export, relative to this notebook.
path = '../data/events_up_to_01062018.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
df = pd.read_csv(path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Parse the raw timestamp values into datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Calendar date of each event.
df['date'] = df['timestamp'].dt.date
# Month number of each event.
df['month'] = df['timestamp'].dt.month
# Weekday name; `dt.day_name()` replaces the removed `dt.weekday_name`.
df['day'] = df['timestamp'].dt.day_name()
# ISO week number; `dt.isocalendar().week` replaces the removed `dt.week`.
df['week'] = df['timestamp'].dt.isocalendar().week
# Whole days between each event and the most recent event in the data set.
# `.dt.days` replaces `.astype('timedelta64[D]')`, which current pandas rejects;
# the cast keeps the float dtype the original conversion produced.
max_date = df['timestamp'].max()
df['timestamp_days'] = (max_date - df['timestamp']).dt.days.astype('float64')

In [5]:
# One row per distinct person id found in the event data.
personas = df[['person']].drop_duplicates()

## Eventos

### Cantidad de eventos por persona

In [6]:
# Total number of (non-null) events recorded for each person.
# groupby(...).count() returns a Series, so assigning to `.columns` never
# renamed anything; the count column is named while resetting the index instead.
events_per_person = (
    df.groupby('person')['event']
    .count()
    .reset_index(name='event_count')
)

In [7]:
# Attach the per-person event totals; the left join keeps every person.
personas = pd.merge(personas, events_per_person, how='left', on='person')

### Cantidad de eventos distintos por persona

In [8]:
# Per-person count of every event type: one column per event name,
# with 0 where a person never produced that event.
events = (
    df.groupby('person')['event']
    .value_counts()
    .unstack()
    .fillna(0)
)
personas = personas.merge(events, how='left', on='person')

### Marca mas vista por el usuario

In [9]:
# Product-view events only, restricted to the columns describing the viewed device.
# `brand` is the first space-separated word of the model name.
# .assign() builds a new frame, so the new column is never written onto a
# slice of `df` (the situation that triggers SettingWithCopyWarning).
viewed_product_event = df.loc[
    df['event'] == 'viewed product',
    ['person', 'sku', 'model', 'condition', 'storage', 'timestamp'],
].assign(brand=lambda views: views['model'].str.split(' ').str[0])

In [10]:
# Views per (person, brand); value_counts orders each person's brands by
# descending count.
events = viewed_product_event.groupby('person')['brand'].value_counts()

# Keep the first row of each person, i.e. the most viewed brand.
# The former `events.max(level=0)` assignment was overwritten on the very next
# line and its `level` argument is gone from current pandas, so it is dropped.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
    # Kept from the original; counts are expected to be >= 1, so likely a no-op.
    .replace({0: np.nan})
)

In [11]:
# Final column names for the most-viewed-brand frame.
top_brand_columns = ['person', 'brand_categ', 'count_top_viewed_product']
top_event.columns = top_brand_columns

# Only the brand label is added to the per-person table.
personas = pd.merge(
    personas,
    top_event[['person', 'brand_categ']],
    how='left',
    on='person',
)

### Estado del dispositivo mas visto por el usuario 

In [12]:
# Views per (person, condition); value_counts orders each person's conditions
# by descending count.
events = viewed_product_event.groupby('person')['condition'].value_counts()

# Keep the first row of each person, i.e. the most viewed condition.
# The former `events.max(level=0)` assignment was overwritten on the very next
# line and its `level` argument is gone from current pandas, so it is dropped.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product_cond')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
    # Kept from the original; counts are expected to be >= 1, so likely a no-op.
    .replace({0: np.nan})
)

top_event.columns = ['person', 'top_condition_categ', 'count_top_viewed_product_cond']
# Only the condition label is added to the per-person table.
personas = personas.merge(top_event[['person', 'top_condition_categ']], on='person', how='left')

### Color del dispositivo mas visto por el usuario 

In [13]:
# Occurrences of each color per person (rows without a color are ignored);
# value_counts orders each person's colors by descending count.
events = df[['person', 'color']].dropna().groupby('person')['color'].value_counts()
# Keep the first row of each person, i.e. the most frequent color.
# The former `events.max(level=0)` assignment was overwritten on the very next
# line and its `level` argument is gone from current pandas, so it is dropped.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)
top_event.columns = ['person', 'color_model_categ', 'count_top_viewed_product']
# Only the color label is added to the per-person table.
personas = personas.merge(top_event[['person', 'color_model_categ']], on='person', how='left')

### Top modelo con mas visualizaciones por usuario

In [14]:
# `model_brand` glues together the first two words of the model name, with no
# separator (the second word is '' when the name has a single word).
# The name is split once, and .assign() returns a new frame instead of writing
# a column onto a frame that was sliced out of `df`.
model_words = viewed_product_event['model'].str.split(' ')
viewed_product_event = viewed_product_event.assign(
    model_brand=model_words.str[0] + model_words.str[1].fillna('')
)

# Views per (person, model_brand); value_counts orders each person's entries
# by descending count.
events = viewed_product_event.groupby('person')['model_brand'].value_counts()
# Keep the first row of each person, i.e. the most viewed model_brand.
# The former `events.max(level=0)` assignment was overwritten on the very next
# line and its `level` argument is gone from current pandas, so it is dropped.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)
top_event.columns = ['person', 'model_brand_views_categ', 'count_top_viewed_product']
# Only the model_brand label is added to the per-person table.
personas = personas.merge(top_event[['person', 'model_brand_views_categ']], on='person', how='left')

### Top evento del usuario

In [15]:
# Occurrences of each event type per person.
events = df.groupby('person')['event'].value_counts()
# First row per person after flattening, relabelled as that person's top event.
top_event = (
    events
    .reset_index(name='count')
    .drop_duplicates(subset='person', keep='first')
    .rename(columns={'event': 'top_event_categ'})
)
# Only the event label is added to the per-person table.
personas = pd.merge(
    personas,
    top_event[['person', 'top_event_categ']],
    how='left',
    on='person',
)

### Tipo de dispositivo donde se realizaron los eventos

In [16]:
# First non-null device type among each person's 'visited site' events.
divice_type = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'device_type']]
    .groupby('person')['device_type']
    .first()
    .to_frame(name='device_type_categ')
)
personas = pd.merge(personas, divice_type, how='left', on='person')

### Cantidad de veces que el usuario regreso

In [17]:
# 1 when the row's new_vs_returning value is 'Returning', 0 otherwise.
df['returning'] = (df['new_vs_returning'] == 'Returning').astype(int)

In [18]:
# Collapse all rows sharing a person id into one total of 'returning' flags.
new_vs_returning = df[['person', 'returning']]
new_vs_ret = (
    new_vs_returning
    .groupby('person')['returning']
    .sum()
    .reset_index()
)

In [19]:
# Attach the per-person count of returning events.
personas = pd.merge(personas, new_vs_ret, how='left', on='person')

## Features en base de tiempo

### Distancia en dias al ultimo evento 

In [20]:
# Smallest timestamp_days value per person, i.e. the distance in days from
# the person's most recent event to the latest event in the data.
dist = (
    df.groupby('person')['timestamp_days']
    .min()
    .reset_index(name='last_event_days')
)

In [21]:
# Attach the days-since-last-event feature.
personas = pd.merge(personas, dist, how='left', on='person')

### Distancia en dias a cada ultimo evento 

In [22]:
# Smallest timestamp_days value per (person, event type), pivoted so each event
# type becomes a column named 'last <event> days'.
dist_max = (
    df.groupby(['person', 'event'])['timestamp_days']
    .min()
    .unstack()
    .add_prefix('last ')
    .add_suffix(' days')
    .rename_axis(None, axis=1)
    .reset_index()
)
# Attach the per-event columns to the per-person table.
personas = pd.merge(personas, dist_max, how='left', on='person')

### Distancia en dias al primer evento 

In [23]:
# Largest timestamp_days value per person, i.e. the distance in days from
# the person's earliest event to the latest event in the data.
dist_max = (
    df.groupby('person')['timestamp_days']
    .max()
    .reset_index(name='first_event_days')
)

In [24]:
# Attach the days-since-first-event feature.
personas = pd.merge(personas, dist_max, how='left', on='person')

### Cantidad de eventos en las ultimas dos semanas

In [25]:
# Window start: the Monday of max_date's week, moved back two more weeks.
# Only whole days are subtracted, so max_date's time of day is kept.
start_date = max_date - datetime.timedelta(days=max_date.weekday(), weeks=2)

# Events that fall inside the window.
df_last_week = df.loc[df['timestamp'] >= start_date, ['person', 'event', 'timestamp']]

# Count per (person, event type), then add the types up into one total per person.
df_events_last_week = df_last_week.groupby('person')['event'].value_counts()
df_events_last_week = (
    df_events_last_week
    .groupby(level='person')
    .sum()
    .reset_index(name='events_cout_last_week')
)

# People without events in the window get 0 after the left join.
personas = pd.merge(personas, df_events_last_week, how='left', on='person')
personas['events_cout_last_week'] = personas['events_cout_last_week'].fillna(0)

### Frecuencia promedio en dias entre eventos en las ultimas 2 semanas

In [26]:
# Whole days between each event in the window and the latest event in the data.
# `.dt.days` replaces `.astype('timedelta64[D]')`, which current pandas rejects;
# the cast keeps the float dtype the original conversion produced.
# .assign() returns a new frame instead of writing onto a slice of `df`.
df_last_week = df_last_week.assign(
    last_event=(max_date - df_last_week['timestamp']).dt.days.astype('float64')
)
# NOTE(review): this is the mean of "days before the latest event" over each
# person's events in the window, not the mean gap between consecutive events —
# confirm this is the intended feature.
df_events_mean = df_last_week.groupby('person')['last_event'].mean()
df_events_mean = df_events_mean.reset_index().fillna(0)

df_events_mean.columns = ['person', 'events_mean_frec_last_week']

# People without events in the window get 0 after the left join.
personas = personas.merge(df_events_mean, on='person', how='left')
personas['events_mean_frec_last_week'] = personas['events_mean_frec_last_week'].fillna(0)

## Busqueda de usuario por marca

In [27]:
# Search events with no missing value in any of the selected columns, with the
# search term lower-cased so the brand patterns below can stay lower-case.
searched_products = (
    df.loc[df['event'] == 'searched products', ['timestamp', 'person', 'skus', 'search_term']]
    .dropna(axis=0)
    .assign(search_term=lambda searches: searches['search_term'].str.lower())
)

# Regex alternatives (brand names, frequent misspellings, model codes) per brand.
# The Samsung pattern is built from adjacent string literals: the previous
# backslash continuation inside one literal embedded the next line's leading
# spaces into the pattern, so the 'j4' alternative could never match.
marcas = {
    'Samsung': ('samsung|sansung|sansumg|s3|s4|s5|s6|s7|s8|s9|a3|a5|a7|a8|a9|a10|j1|j2|j3|'
                'j4|j5|j7|j8|e5|e7|galaxy|note|prime|edge'),
    'Apple': 'iphone|apple|ipad|i phone|aple|iphome|4s|5s|6s|7s|8s',
    'Motorola': 'moto|g4 plus|g5 plus',
    'LG': 'lg|k8|k10|k 10|k4',
    'Asus': 'asus|zonfone|zenfone|azus|assus',
    'Nokia': 'nokia|lumia',
    'Lenovo': 'lenovo',
    'Sony': 'sony|xperia|x-peria',
    'Blackberry': 'blackberry',
    'Quantum': 'quantum',
}

# Label each search with the brand whose pattern it contains; when several
# patterns match, the brand listed last in `marcas` overwrites the earlier ones.
for marca, values in marcas.items():
    searched_products.loc[searched_products['search_term'].str.contains(values),
                          'searched_model'] = marca

# Searches per (person, brand); value_counts orders each person's brands by
# descending count, so the first row per person is the most searched brand.
# The former `searched_products.max(level=0)` assignment was overwritten on the
# very next line and its `level` argument is gone from current pandas.
searched_products = searched_products.groupby('person')['searched_model'].value_counts()
top_searched_products = (
    searched_products
    .reset_index(level=1, name='count_top_searched_model')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)
top_searched_products.columns = ['person', 'top_searched_model_categ', 'count_top_searched_model']

# Only the brand label is added to the per-person table.
personas = personas.merge(top_searched_products[['person', 'top_searched_model_categ']], on='person', how='left')

## Origen de los eventos

### Region con mayor eventos del usuario 

En el caso de que el usuario tenga la misma cantidad de eventos en dos regiones distintas, se da prioridad a la región cuyos eventos se realizaron más recientemente.

In [28]:
# Number of 'visited site' events per (person, region), with 'Unknown' treated
# as missing. replace() is chained rather than applied in place, so nothing is
# mutated on a slice taken from `df`.
# NOTE(review): the rows are sorted newest-first before counting, but
# value_counts(sort=False) may not keep that order inside each person — confirm
# that ties between regions really end up resolved by recency.
region_per_person = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'region', 'timestamp']]
    .replace({'Unknown': np.nan})
    .sort_values(by='timestamp', ascending=False)
    .groupby('person')['region']
    .value_counts(sort=False)
)

In [29]:
# Flag, for every (person, region) pair, whether its count equals that person's
# maximum count. The aggregation is named as the string 'max': passing the
# builtin `max` to transform is deprecated in current pandas.
region_per_person = region_per_person.groupby(['person']).transform('max') == region_per_person
region_per_person = (
    region_per_person
    .reset_index(name='top_region_count')
    .rename(columns={'region': 'region_categ'})
)
# Keep only the flagged rows, then the first one per person.
region_per_person = (
    region_per_person
    .loc[region_per_person['top_region_count']]
    .drop_duplicates(subset='person', keep='first')
    # Kept from the original; no zero values are expected here.
    .replace({0: np.nan})
)

In [30]:
# Only the region label is added to the per-person table.
personas = pd.merge(
    personas,
    region_per_person[['person', 'region_categ']],
    how='left',
    on='person',
)

### Ciudad del usuario 

In [31]:
# First non-null city among each person's 'visited site' events, with 'Unknown'
# treated as missing. replace() is chained rather than applied in place, so
# nothing is mutated on a slice taken from `df`.
ciudad = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'city']]
    .replace({'Unknown': np.nan})
    .groupby('person')
    .agg({'city': 'first'})
    .rename(columns={'city': 'city_categ'})
)
personas = personas.merge(ciudad, on='person', how='left')

### Pais del usuario 

In [32]:
# First non-null country among each person's 'visited site' events, with
# 'Unknown' treated as missing. replace() is chained rather than applied in
# place, so nothing is mutated on a slice taken from `df`.
pais = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'country']]
    .replace({'Unknown': np.nan})
    .groupby('person')
    .agg({'country': 'first'})
    .rename(columns={'country': 'country_categ'})
)

personas = personas.merge(pais, on='person', how='left')

### Save csv

In [33]:
# Write the per-person feature table to disk without the row index.
personas.to_csv(path_or_buf='test/data-per-person2.csv', index=False)