In [1]:
import pandas as pd
import numpy as np

In [2]:
path = '../data/events_up_to_01062018.csv'
df = pd.read_csv(path)

  interactivity=interactivity, compiler=compiler, result=result)


In [351]:
def colum_per_person(df, columna):
    ''' Agrupa por persona segun el feacture columna
        Recibe el data frame y la columna.
        Deveuelve un nuevo data frame con las columnas ['person', 'feature'], ver que puede haber
        mas de un feature por cada persona. '''
    
    col = df.groupby('person')[columna].value_counts().to_frame()
    col.columns = ['count']
    col.reset_index(inplace=True)
    col.drop('count', axis=1, inplace=True)
    
    return col

In [352]:
# Damos formato a la fecha.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Agrego columna fecha
df['date'] = df['timestamp'].dt.date
# Agrego columna para los meses.
df['month'] = df['timestamp'].dt.month
# Agrego columna dias
df['day'] = df['timestamp'].dt.weekday_name
# Agrego columna de semanas
df['week'] = df['timestamp'].dt.week

In [353]:
# Personas en el df.
personas = df.groupby('person')['sku'].count()
personas = personas.reset_index()['person'].to_frame()

## Eventos por mes en promedio

In [354]:
df_event = df[['person','event', 'month']]
cv_mes = df_event.groupby(['person', 'month'])['event'].count()
cv_mes = cv_mes.unstack(fill_value=0).mean(axis=1).reset_index()
cv_mes.columns = ['person', 'events_mean']

personas = personas.merge(cv_mes, on='person', how='left').fillna(0)

## Conversiones por mes en promedio

In [355]:
# Conversiones por mes
df_event = df[['person','event', 'month']].loc[df['event'] == 'conversion']
cv_mes = df_event.groupby(['person', 'month'])['event'].value_counts()
cv_mes = cv_mes.unstack(fill_value=0).unstack(fill_value=0)

levels = np.array(cv_mes.columns.levels)
cv_mes.columns = ["{}_in_month_{}".format(n,i) for n in levels[0] for i in range(1,6)]

    #promedio
cv_mes = cv_mes.mean(axis=1).reset_index()
cv_mes.columns = ['person', 'conversion_month_mean']
    
personas = personas.merge(cv_mes, on='person', how='left').fillna(0)

## Checkout por mes en promedio

In [356]:
# Conversiones por mes
df_event = df[['person','event', 'month']].loc[df['event'] == 'checkout']
cout_mes = df_event.groupby(['person', 'month'])['event'].value_counts()
cout_mes = cout_mes.unstack(fill_value=0).unstack(fill_value=0)

levels = np.array(cout_mes.columns.levels)
cout_mes.columns = ["{}_in_month_{}".format(n,i) for n in levels[0] for i in range(1,6)]

#promedio
cout_mes = cout_mes.mean(axis=1).reset_index()
cout_mes.columns = ['person', 'checkout_month_mean']

### Conversion/Checkout por mes en promedio

In [357]:
cout_mes['conv/checkout'] = cv_mes['conversion_month_mean']/(cout_mes['checkout_month_mean']+1)

# merge 
personas = personas.merge(cout_mes[['person','conv/checkout']], on='person', how='left').fillna(0)

##  Country/region/city per Preson

Es nescesario tener en cuenta la region de donde se realizo el evento ya que puede estar influenciado por el estatus social por ejemplo.

In [358]:
# Region
region = colum_per_person(df, 'region')
region = pd.concat([region.drop('region', axis=1), pd.get_dummies(region['region'])], axis=1).fillna(0)

# Uniendo las personas con mismo id.
region = region.groupby('person').sum().reset_index()

region.drop('Unknown', axis=1, inplace=True)
#personas = personas.merge(region, on='person', how='left').fillna(0)

## New vs Returning 

Estaria bueno saber en que tiempo ocurren.

In [359]:
new_vs_returning = colum_per_person(df, 'new_vs_returning')

In [360]:
# Asigno a new como -1 y si regreso a la app 1.
new_vs_returning = new_vs_returning.replace({'New' : 0, 'Returning' : 1})

In [361]:
# Uniendo las personas con mismo id.le
new_vs_ret = new_vs_returning.groupby('person').sum().reset_index()

In [362]:
personas = personas.merge(new_vs_ret, on='person', how='left')

## Features en base de tiempo

## Distancia en dias al ultimo evento 

In [363]:
dist = df[['person','event', 'timestamp']]
dist = dist.groupby('person')['timestamp'].max().to_frame(name='last_event').reset_index()

In [364]:
dist['last_event'] = (max_date - dist['last_event'])
dist['last_event'] = dist['last_event'].astype('timedelta64[D]')
max_day = max(dist['last_event'])

# A mayor distancia menor prob. de conversion.
dist['last_event'] = np.absolute(max_day - dist['last_event'])

In [365]:
personas = personas.merge(dist, on='person', how='left').fillna(0)

## Cantidad de eventos en la ultima semana

In [366]:
import datetime

def previous_week_range(date):
    start_date = date + datetime.timedelta(-date.weekday(), weeks=-1)
    end_date = date + datetime.timedelta(-date.weekday() - 1)
    return start_date, end_date

date_range = previous_week_range(max_date)

# Date range
df_last_week = df.loc[(df['timestamp'] >= date_range[0]) ,['person','event','timestamp']]

df_events_last_week = df_last_week.groupby('person')['event'].value_counts()
df_events_last_week = df_events_last_week.unstack(fill_value=0).sum(axis=1).reset_index()
df_events_last_week.columns = ['person', 'events_cout_last_week']

#merge
personas = personas.merge(df_events_last_week, on='person', how='left').fillna(0)

## Frecuencia promedio en dias entre eventos en la ultima semana

In [367]:
df_last_week['last_event'] = (max_date - df_last_week['timestamp']).astype('timedelta64[D]')
df_events_mean = df_last_week.groupby('person')['last_event'].mean()
df_events_mean = df_events_mean.reset_index()

df_events_mean.columns = ['person', 'events_mean_frec_last_week']

#merge
personas = personas.merge(df_events_mean, on='person', how='left').fillna(0)

## Top evento del usuario

In [368]:
events = df[['person','event','timestamp']].groupby('person')['event'].value_counts()

In [369]:
top_event = events.max(level=0).reset_index()
#personas = personas.merge(top_event, on='person', how='left').fillna(0)

In [370]:
top_events = events.reset_index(level=1, name='count_top_event').reset_index().drop_duplicates(subset='person', keep='first')
top_events = pd.concat([top_events.drop('event', axis=1), pd.get_dummies(top_events['event'], prefix='top')], axis=1)

In [371]:
personas = personas.merge(top_events, on='person', how='left').fillna(0)

## Busqueda de usuario por marca

In [372]:
searched_products = df.loc[df['event'] == 'searched products'][['timestamp','person','skus','search_term']].dropna(axis=0)
searched_products['search_term'] = searched_products['search_term'].str.lower()

marcas = {'Samsung' : 'samsung|sansung|sansumg|s3|s4|s5|s6|s7|s8|s9|a3|a5|a7|a8|a9|a10|j1|j2|j3|\
                        j4|j5|j7|j8|e5|e7|galaxy|note|prime|edge',
            'Apple' : 'iphone|apple|ipad|i phone|aple|iphome|4s|5s|6s|7s|8s',
            'Motorola' : 'moto|g4 plus|g5 plus',
              'LG' : 'lg|k8|k10|k 10|k4',
              'Asus' : 'asus|zonfone|zenfone|azus|assus',
              'Nokia' : 'nokia|lumia',
              'Lenovo' : 'lenovo',
              'Sony' : 'sony|xperia|x-peria',
              'Blackberry': 'blackberry',
              'Quantum' : 'quantum'
         }

for marca, values in marcas.items():
    searched_products.loc[searched_products['search_term'].str.contains(values)
                         , 'searched_model'] = marca

searched_products = colum_per_person(searched_products, 'searched_model')

searched_products = pd.concat([searched_products.drop('searched_model', axis=1), pd.get_dummies(searched_products['searched_model'], prefix='searched_model')], axis=1)
searched_products = searched_products.groupby('person').sum()
personas = personas.merge(searched_products, on='person', how='left').fillna(0)

### Save csv

In [373]:
personas.to_csv('test/data-per-person2.csv', index=False)

In [374]:
personas.head()

Unnamed: 0,person,events_mean,conversion_month_mean,conv/checkout,new_vs_returning,last_event,events_cout_last_week,events_mean_frec_last_week,count_top_event,top_ad campaign hit,...,searched_model_Apple,searched_model_Asus,searched_model_Blackberry,searched_model_LG,searched_model_Lenovo,searched_model_Motorola,searched_model_Nokia,searched_model_Quantum,searched_model_Samsung,searched_model_Sony
0,0008ed71,1.2,0.0,0.125,1.0,136.0,0.0,0.0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00091926,89.6,0.0,0.142857,1.0,150.0,95.0,5.515789,372,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00091a7a,2.0,0.0,0.0,0.0,84.0,0.0,0.0,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000ba417,41.2,0.2,0.090909,1.0,145.0,138.0,6.144928,153,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,000c79fe,3.4,0.0,0.333333,0.0,148.0,17.0,2.0,9,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
