In [6]:
import pandas as pd
import numpy as np

In [8]:
path = '../data/events_up_to_01062018.csv'
df = pd.read_csv(path)

  interactivity=interactivity, compiler=compiler, result=result)


In [81]:
def colum_per_person(df, columna):
    ''' Agrupa por persona segun el feacture columna
        Recibe el data frame y la columna.
        Deveuelve un nuevo data frame con las columnas ['person', 'feature'], ver que puede haber
        mas de un feature por cada persona. '''
    
    col = df.groupby('person')[columna].value_counts().to_frame()
    col.columns = ['count']
    col.reset_index(inplace=True)
    col.drop('count', axis=1, inplace=True)
    
    return col

In [82]:
# Damos formato a la fecha.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Agrego columna fecha
df['date'] = df['timestamp'].dt.date
# Agrego columna para los meses.
df['month'] = df['timestamp'].dt.month
# Agrego columna dias
df['day'] = df['timestamp'].dt.weekday_name
# Agrego columna de semanas
df['week'] = df['timestamp'].dt.week
# Day
max_date = max(df['timestamp'])
df['timestamp_days'] = (max_date - df['timestamp']).astype('timedelta64[D]')

In [83]:
# Personas en el set de datos.
personas = df['person'].drop_duplicates().to_frame()

In [84]:
personas.count()

person    38829
dtype: int64

## Eventos

### Cantidad de eventos por persona

In [85]:
events_per_person = df.groupby('person')['event'].count()
events_per_person.columns = ['event_count']
events_per_person = events_per_person.reset_index()

In [86]:
personas = personas.merge(events_per_person, on='person', how='left')

### Modelo mas visto por el usuario

In [87]:
viewed_product_event = df.loc[df['event'] == 'viewed product', ['person', 'sku', 'model', 'condition', 'storage', 'timestamp']]

In [88]:
events = viewed_product_event.groupby('person')['model'].value_counts()

top_event = events.max(level=0).reset_index()
top_event = events.reset_index(level=1, name='count_top_viewed_product').reset_index().drop_duplicates(subset='person', keep='first')

In [89]:
personas = personas.merge(top_event, on='person', how='left')

### Estado del dispositivo mas visto por el usuario >

In [90]:
events = viewed_product_event.groupby('person')['condition'].value_counts()

top_event = events.max(level=0).reset_index()
top_event = events.reset_index(level=1, name='count_top_viewed_product_cond').reset_index().drop_duplicates(subset='person', keep='first')
#personas = personas.merge(top_event, on='person', how='left')

## New vs Returning 

In [91]:
new_vs_returning = colum_per_person(df, 'new_vs_returning')

In [92]:
# Asigno a new como -1 y si regreso a la app 1.
new_vs_returning = new_vs_returning.replace({'New' : 0, 'Returning' : 1})

In [93]:
# Uniendo las personas con mismo id.le
new_vs_ret = new_vs_returning.groupby('person').sum().reset_index()

In [94]:
personas = personas.merge(new_vs_ret, on='person', how='left')

## Features en base de tiempo

## Distancia en dias al ultimo evento 

In [95]:
dist = df[['person', 'timestamp_days']]
dist = dist.groupby('person')['timestamp_days'].min().to_frame(name='last_event_days').reset_index()

In [96]:
personas = personas.merge(dist, on='person', how='left')
personas['last_event_days'] = personas['last_event_days'].fillna(personas['last_event_days'].mean())

## Distancia al primer evento >

In [97]:
dist_max = df[['person', 'timestamp_days']]
dist_max = dist_max.groupby('person')['timestamp_days'].max().to_frame(name='first_event_days').reset_index()

In [98]:
personas = personas.merge(dist_max, on='person', how='left')
personas['first_event_days'] = personas['first_event_days'].fillna(personas['first_event_days'].mean())

## Cantidad de eventos en la ultima semana

In [99]:
import datetime

def previous_week_range(date):
    start_date = date + datetime.timedelta(-date.weekday(), weeks=-1)
    end_date = date + datetime.timedelta(-date.weekday() - 1)
    return start_date, end_date

date_range = previous_week_range(max_date)

# Date range
df_last_week = df.loc[(df['timestamp'] >= date_range[0]) ,['person','event','timestamp']]

df_events_last_week = df_last_week.groupby('person')['event'].value_counts()
df_events_last_week = df_events_last_week.unstack(fill_value=0).sum(axis=1).reset_index()
df_events_last_week.columns = ['person', 'events_cout_last_week']

#merge
personas = personas.merge(df_events_last_week, on='person', how='left').fillna(0)

## Frecuencia promedio en dias entre eventos en la ultima semana

In [100]:
df_last_week['last_event'] = (max_date - df_last_week['timestamp']).astype('timedelta64[D]')
df_events_mean = df_last_week.groupby('person')['last_event'].mean()
df_events_mean = df_events_mean.reset_index()

df_events_mean.columns = ['person', 'events_mean_frec_last_week']

#merge
personas = personas.merge(df_events_mean, on='person', how='left').fillna(0)

## Top evento del usuario

In [101]:
events = df[['person','event','timestamp']].groupby('person')['event'].value_counts()

In [102]:
top_event = events.max(level=0).reset_index()

In [103]:
top_event.head()

Unnamed: 0,person,event
0,0008ed71,3
1,00091926,372
2,00091a7a,5
3,000ba417,153
4,000c79fe,9


## Busqueda de usuario por marca

In [104]:
searched_products = df.loc[df['event'] == 'searched products'][['timestamp','person','skus','search_term']].dropna(axis=0)
searched_products['search_term'] = searched_products['search_term'].str.lower()

marcas = {'Samsung' : 'samsung|sansung|sansumg|s3|s4|s5|s6|s7|s8|s9|a3|a5|a7|a8|a9|a10|j1|j2|j3|\
                        j4|j5|j7|j8|e5|e7|galaxy|note|prime|edge',
            'Apple' : 'iphone|apple|ipad|i phone|aple|iphome|4s|5s|6s|7s|8s',
            'Motorola' : 'moto|g4 plus|g5 plus',
              'LG' : 'lg|k8|k10|k 10|k4',
              'Asus' : 'asus|zonfone|zenfone|azus|assus',
              'Nokia' : 'nokia|lumia',
              'Lenovo' : 'lenovo',
              'Sony' : 'sony|xperia|x-peria',
              'Blackberry': 'blackberry',
              'Quantum' : 'quantum'
         }

for marca, values in marcas.items():
    searched_products.loc[searched_products['search_term'].str.contains(values)
                         , 'searched_model'] = marca


searched_products = searched_products.groupby('person')['searched_model'].value_counts()
top_searched_products = searched_products.max(level=0).reset_index()
top_searched_products = searched_products.reset_index(level=1, name='count_top_searched_model').reset_index().drop_duplicates(subset='person', keep='first')

#merge
personas = personas.merge(top_searched_products, on='person', how='left')

## Region

### Save csv

In [105]:
personas.to_csv('test/data-per-person2.csv', index=False)

In [106]:
personas.head()

Unnamed: 0,person,event,model,count_top_viewed_product,new_vs_returning,last_event_days,first_event_days,events_cout_last_week,events_mean_frec_last_week,searched_model,count_top_searched_model
0,4886f805,9,Samsung Galaxy J7 Prime,4.0,0.0,13.0,13.0,0.0,0.0,Samsung,1.0
1,ad93850f,65,iPhone 5s,17.0,1.0,9.0,17.0,7.0,9.0,,
2,0297fc1e,567,iPhone 6,213.0,1.0,3.0,142.0,78.0,6.769231,Apple,5.0
3,2d681dd8,26,iPhone 7,6.0,1.0,4.0,13.0,16.0,4.0,Motorola,1.0
4,cccea85e,836,Motorola Moto G4 Plus,252.0,1.0,0.0,24.0,270.0,1.540741,LG,1.0
