In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw events export, addressed relative to the notebook's working directory.
path = '../data/events_up_to_01062018.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning text left in this
# cell's output looks like the tail of a mixed-dtype DtypeWarning, which this
# option addresses - confirm against the full warning.
df = pd.read_csv(path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Parse the raw timestamp strings into datetime values.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Calendar date of each event (time of day dropped).
df['date'] = df['timestamp'].dt.date
# Month number of each event.
df['month'] = df['timestamp'].dt.month
# Day-of-week name. `.dt.weekday_name` was removed in pandas 1.0;
# `.dt.day_name()` is its replacement.
df['day'] = df['timestamp'].dt.day_name()
# ISO week number. `.dt.week` was removed in pandas 2.0;
# `.dt.isocalendar().week` is its replacement.
df['week'] = df['timestamp'].dt.isocalendar().week
# Whole days between each event and the most recent event in the data set.
# `Series.max()` is vectorised, unlike the builtin `max`, and `.dt.days`
# replaces `.astype('timedelta64[D]')`, which pandas 2.0 rejects for timedelta
# data. The float cast keeps the dtype the old expression produced.
max_date = df['timestamp'].max()
df['timestamp_days'] = (max_date - df['timestamp']).dt.days.astype('float64')

In [5]:
# One row per distinct user id found in the events; the per-user features
# computed in later cells are merged onto this frame.
personas = df[['person']].drop_duplicates()
category_values = list()

In [6]:
# Display the number of non-null entries in each column of `personas`.
personas.count()

person    38829
dtype: int64

## Eventos

### Cantidad de eventos por persona

In [7]:
# Number of non-null `event` values recorded for each user.
# `reset_index(name=...)` is what actually names the count column: the old
# `events_per_person.columns = [...]` assignment ran on a Series, where it
# renames nothing, so the column used to come out as 'event' instead of
# 'event_count'.
events_per_person = (
    df.groupby('person')['event']
    .count()
    .reset_index(name='event_count')
)

In [8]:
# Left join: every row of `personas` is kept and gains the per-user event count.
personas = pd.merge(personas, events_per_person, how='left', on='person')

### Cantidad de eventos distintos por persona

In [9]:
# Count every event type per user, then pivot the types into columns;
# combinations that never occurred become 0.
per_type_counts = df.groupby('person')['event'].value_counts()
events = per_type_counts.unstack().fillna(0)
personas = personas.merge(events, how='left', on='person')

### Modelo mas visto por el usuario

In [10]:
# Rows for 'viewed product' events only, restricted to the product columns
# that the following cells aggregate.
is_product_view = df['event'] == 'viewed product'
viewed_product_event = df.loc[
    is_product_view,
    ['person', 'sku', 'model', 'condition', 'storage', 'timestamp'],
]

In [11]:
# View count of every model per user; value_counts sorts each user's models
# from most to least viewed.
events = viewed_product_event.groupby('person')['model'].value_counts()

# Keep the first (most viewed) model of each user together with its count.
# The former `events.max(level=0)` assignment was dropped: its result was
# overwritten on the very next line, and the `level` argument was removed
# from Series.max in pandas 2.0.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)

In [12]:
# Attach each user's most viewed model and its count; users with no product
# views keep missing values in those columns.
personas = pd.merge(personas, top_event, how='left', on='person')

In [13]:
# Preview the first rows of the per-user top-model table.
top_event.head()

Unnamed: 0,person,model,count_top_viewed_product
0,00091926,iPhone 6S,94
36,00091a7a,iPhone 6,1
39,000ba417,Samsung Galaxy A3 Duos,28
65,000c79fe,iPhone 7,3
66,000e4d9e,Samsung Galaxy S6 Flat,139


### Estado del dispositivo mas visto por el usuario >

In [14]:
# View count of every device condition per user; value_counts sorts each
# user's conditions from most to least viewed.
events = viewed_product_event.groupby('person')['condition'].value_counts()

# Keep the first (most viewed) condition of each user together with its count.
# The former `events.max(level=0)` assignment was dropped: its result was
# overwritten on the very next line, and the `level` argument was removed
# from Series.max in pandas 2.0.
top_event = (
    events
    .reset_index(level=1, name='count_top_viewed_product_cond')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)
personas = personas.merge(top_event, on='person', how='left')

## device_type

In [17]:
# Feature: device type of the user, taken as the first non-null `device_type`
# among that user's 'visited site' events.
is_site_visit = df['event'].str.contains('visited site')
divice_type = df.loc[is_site_visit, ['person', 'device_type']]
divice_type = divice_type.groupby(['person']).agg({'device_type': 'first'})
personas = pd.merge(personas, divice_type, how='left', on='person')

## Cantidad de veces que el usuario regreso

In [20]:
# 1 for rows whose new_vs_returning is 'Returning', 0 for every other row
# (missing values included).
df['returning'] = (df['new_vs_returning'] == 'Returning').astype(int)

In [21]:
# Collapse the rows that share a user id: total of the `returning` flag per user.
new_vs_returning = df.loc[:, ['person', 'returning']]
new_vs_ret = new_vs_returning.groupby('person')['returning'].sum().reset_index()

In [22]:
# Attach the per-user `returning` total.
personas = pd.merge(personas, new_vs_ret, how='left', on='person')

## Features en base de tiempo

## Distancia en dias al ultimo evento 

In [23]:
# Smallest `timestamp_days` per user, i.e. the day distance from the user's
# most recent event to the most recent event in the data set.
dist = (
    df.groupby('person')['timestamp_days']
    .min()
    .rename('last_event_days')
    .reset_index()
)

In [24]:
personas = pd.merge(personas, dist, how='left', on='person')
# Users left without a value after the join receive the column mean.
mean_last_event_days = personas['last_event_days'].mean()
personas['last_event_days'] = personas['last_event_days'].fillna(mean_last_event_days)

## Distancia al primer evento >

In [25]:
# Largest `timestamp_days` per user, i.e. the day distance from the user's
# earliest event to the most recent event in the data set.
dist_max = (
    df.groupby('person')['timestamp_days']
    .max()
    .rename('first_event_days')
    .reset_index()
)

In [26]:
#personas = personas.merge(dist_max, on='person', how='left')
#personas['first_event_days'] = personas['first_event_days'].fillna(personas['first_event_days'].mean())

## Cantidad de eventos en la ultima semana

In [27]:
import datetime

def previous_week_range(date):
    """Return the (start, end) bounds of the week before the one holding `date`.

    `start` is `date` moved back to the Monday of the previous week and `end`
    is `date` moved back to the Sunday of the previous week (`weekday()`
    counts Monday as 0). Only whole days are subtracted, so any time-of-day
    component of `date` is carried over unchanged to both bounds.
    """
    monday_this_week = date - datetime.timedelta(days=date.weekday())
    start_date = monday_this_week - datetime.timedelta(weeks=1)
    end_date = monday_this_week - datetime.timedelta(days=1)
    return start_date, end_date

# Monday and Sunday of the week preceding the most recent event.
date_range = previous_week_range(max_date)

# Events from the start of that range onwards. Only the lower bound is
# applied, so events after date_range[1] are included as well.
# .copy() yields an independent frame, so columns can be added to it later
# without writing into a slice of `df`.
df_last_week = df.loc[df['timestamp'] >= date_range[0], ['person', 'event', 'timestamp']].copy()

# Number of events per user inside the window (a direct count replaces the
# former value_counts -> unstack -> row-sum chain, which produced the same totals).
df_events_last_week = df_last_week.groupby('person')['event'].count().reset_index()
df_events_last_week.columns = ['person', 'events_cout_last_week']

# Merge, then zero-fill ONLY the new column. The previous `.fillna(0)` on the
# whole frame also overwrote missing values in unrelated columns merged
# earlier (e.g. putting the integer 0 into text columns such as the top model).
personas = personas.merge(df_events_last_week, on='person', how='left')
personas['events_cout_last_week'] = personas['events_cout_last_week'].fillna(0)

## Frecuencia promedio en dias entre eventos en la ultima semana

In [28]:
# Whole days between each windowed event and the most recent event in the
# data set. `.dt.days` replaces `.astype('timedelta64[D]')`, which pandas 2.0
# rejects for timedelta data; the float cast keeps the dtype the old
# expression produced. `assign` builds a new frame rather than writing a
# column into what may be a slice of `df`.
df_last_week = df_last_week.assign(
    last_event=(max_date - df_last_week['timestamp']).dt.days.astype('float64')
)

# Per-user mean of that day distance.
df_events_mean = df_last_week.groupby('person')['last_event'].mean().reset_index()
df_events_mean.columns = ['person', 'events_mean_frec_last_week']

# Merge, then zero-fill ONLY the new column. The previous `.fillna(0)` on the
# whole frame also overwrote missing values in every column merged earlier.
personas = personas.merge(df_events_mean, on='person', how='left')
personas['events_mean_frec_last_week'] = personas['events_mean_frec_last_week'].fillna(0)

## Top evento del usuario

In [29]:
# Occurrences of every event type per user.
events = df.groupby('person')['event'].value_counts()

In [30]:
# Highest single-event-type count per user. `Series.max(level=0)` was removed
# in pandas 2.0; grouping on the first index level is the supported equivalent.
top_event = events.groupby(level=0).max().reset_index()

In [31]:
#personas = personas.merge(top_event, on='person', how='left')

## Busqueda de usuario por marca

In [32]:
# 'searched products' events with no missing value in any selected column,
# with the search term lower-cased so the patterns below can stay lower-case.
searched_products = df.loc[
    df['event'] == 'searched products',
    ['timestamp', 'person', 'skus', 'search_term'],
].dropna(axis=0)
searched_products['search_term'] = searched_products['search_term'].str.lower()

# Brand -> regex alternation of spellings / model codes searched for in the term.
# The Samsung pattern is built from adjacent string literals. The former
# backslash continuation inside a single literal embedded the next line's
# leading spaces into the pattern, so the 'j4' alternative only matched when
# preceded by that run of spaces - i.e. effectively never.
marcas = {
    'Samsung': ('samsung|sansung|sansumg|s3|s4|s5|s6|s7|s8|s9|a3|a5|a7|a8|a9|a10|j1|j2|j3|'
                'j4|j5|j7|j8|e5|e7|galaxy|note|prime|edge'),
    'Apple': 'iphone|apple|ipad|i phone|aple|iphome|4s|5s|6s|7s|8s',
    'Motorola': 'moto|g4 plus|g5 plus',
    'LG': 'lg|k8|k10|k 10|k4',
    'Asus': 'asus|zonfone|zenfone|azus|assus',
    'Nokia': 'nokia|lumia',
    'Lenovo': 'lenovo',
    'Sony': 'sony|xperia|x-peria',
    'Blackberry': 'blackberry',
    'Quantum': 'quantum',
}

# Tag each search with the brand whose pattern it contains. Brands are applied
# in dict order, so when several patterns match the later brand wins.
for marca, values in marcas.items():
    matches_brand = searched_products['search_term'].str.contains(values)
    searched_products.loc[matches_brand, 'searched_model'] = marca

# Searches per (user, brand); value_counts sorts each user's brands from most
# to least searched, so the first row per user is the top brand.
# The former `searched_products.max(level=0)` assignment was dropped: its
# result was overwritten on the next line and the `level` argument was
# removed from Series.max in pandas 2.0.
searched_products = searched_products.groupby('person')['searched_model'].value_counts()
top_searched_products = (
    searched_products
    .reset_index(level=1, name='count_top_searched_model')
    .reset_index()
    .drop_duplicates(subset='person', keep='first')
)

# Attach the top searched brand and its count to every user.
personas = personas.merge(top_searched_products, on='person', how='left')

## Region

### Region de ultimo evento >

In [33]:
# Region of each user's most recent 'visited site' event: 'Unknown' is treated
# as missing, rows are ordered newest first and the first row per user is kept.
region_per_person = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'region', 'timestamp']]
    .replace({'Unknown': np.nan})
    .sort_values(by='timestamp', ascending=False)
    .drop_duplicates(subset='person', keep='first')
)

#personas = personas.merge(region_per_person, on='person', how='left')

### Region con mayor eventos del usuario >

En el caso de que el usuario tenga la misma cantidad de eventos en dos regiones distintas, se da prioridad a la región cuyos eventos son más recientes.

In [34]:
# 'visited site' events with a known region ('Unknown' is treated as missing).
region_visits = df.loc[df['event'].str.contains('visited site'), ['person', 'region', 'timestamp']]
region_visits = region_visits.replace({'Unknown': np.nan}).dropna(subset=['region'])

# Visits and latest visit time for every (user, region) pair, ordered inside
# each user by visit count and then by recency. The previous
# `sort_values(...).groupby(...).value_counts(sort=False)` did not carry the
# timestamp order into its result (regions came back in value order), so ties
# were not resolved in favour of the most recent region as intended.
region_stats = (
    region_visits.groupby(['person', 'region'])['timestamp']
    .agg(['size', 'max'])
    .reset_index()
    .sort_values(['person', 'size', 'max'], ascending=[True, False, False])
)

# Series of visit counts indexed by (person, region), in the order built
# above, so picking the first top-count row per user yields the most
# recently visited region among ties.
region_per_person = region_stats.set_index(['person', 'region'])['size']

In [35]:
# True where a (person, region) count equals that user's highest region count.
# The aggregation is named as the string 'max': passing the builtin `max`
# triggers a FutureWarning in recent pandas releases.
is_top_region = region_per_person.groupby(['person']).transform('max') == region_per_person
region_per_person = is_top_region.reset_index(name='top_region_count')
# Keep only the top-count rows, then the first of them for each user.
region_per_person = (
    region_per_person
    .loc[region_per_person['top_region_count']]
    .drop_duplicates(subset='person', keep='first')
)

In [36]:
# Preview the first rows of the per-user top-region table.
region_per_person.head()

Unnamed: 0,person,region,top_region_count
0,00091926,Rio Grande do Sul,True
1,00091a7a,Minas Gerais,True
2,000ba417,Sao Paulo,True
3,000c79fe,Minas Gerais,True
4,000e4d9e,Minas Gerais,True


In [37]:
# Attach each user's top region; the boolean helper column is left out.
top_region = region_per_person.loc[:, ['person', 'region']]
personas = pd.merge(personas, top_region, how='left', on='person')

### Ciudad del usuario >

In [38]:
# City of the user: first non-null `city` among that user's 'visited site'
# events, with 'Unknown' treated as missing.
ciudad = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'city']]
    .replace({'Unknown': np.nan})
    .groupby('person')
    .agg({'city': 'first'})
)
personas = pd.merge(personas, ciudad, how='left', on='person')

### Pais del usuario >

In [39]:
# Country of the user: first non-null `country` among that user's
# 'visited site' events, with 'Unknown' treated as missing.
pais = (
    df.loc[df['event'].str.contains('visited site'), ['person', 'country']]
    .replace({'Unknown': np.nan})
    .groupby('person')
    .agg({'country': 'first'})
)
personas = pd.merge(personas, pais, how='left', on='person')

### Save csv

In [40]:
# Write the per-user feature table to CSV without the row index.
# The path is relative to the notebook's working directory.
personas.to_csv('test/data-per-person2.csv', index=False)