### =======================================================================
### IMPORTACIÓN DE DATOS.
### =======================================================================

In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [2]:
training = ''
eventos = ''
dummies = ''
test = ''

In [3]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
#training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN DE LOS DIFERENTES EVENTOS.
eventos = pd.read_csv('events_up_to_01062018.csv', encoding = 'utf-8')
## OBTENEMOS TODA LA INFORMACIÓN A TESTEAR.
#test = pd.read_csv('trocafone_kaggle_test.csv', encoding = 'utf-8')

In [4]:
## PASAMOS LAS COLUMNAS QUE TIENEN UNA CANTIDAD DE VALORES LIMITADA A UN TIPO CATEGORY
eventos['person'] = eventos['person'].astype('category')
eventos['event'] = eventos['event'].astype('category')
eventos['condition'] = eventos['condition'].astype('category')
eventos['storage'] = eventos['storage'].astype('category')
eventos['search_engine'] = eventos['search_engine'].astype('category')
eventos['channel'] = eventos['channel'].astype('category')
eventos['new_vs_returning'] = eventos['new_vs_returning'].astype('category')
eventos['device_type'] = eventos['device_type'].astype('category')
eventos['color'] = eventos['color'].astype('category')
eventos['region'] = eventos['region'].astype('category')
eventos['country'] = eventos['country'].astype('category')
eventos['operating_system_version'] = eventos['operating_system_version'].astype('category')
eventos['city'] = eventos['city'].astype('category')
eventos['browser_version'] = eventos['browser_version'].astype('category')
eventos['screen_resolution'] = eventos['screen_resolution'].astype('category')
eventos['model'] = eventos['model'].astype('category')
eventos['timestamp'] = pd.to_datetime(eventos['timestamp'], infer_datetime_format=True)

### =======================================================================
### ARMADO DE FEATURES.
### =======================================================================

In [5]:
## ORDENAMOS LOS DATOS ṔOR PERSONAS EN PRIMER LUGAR Y TIEMPO EN SEGUNDO.
eventos.sort_values(['person', 'timestamp'], ascending=[True, True], inplace=True)
## DIFERENCIAMOS EN TRES COLUMNAS DIFERENTES EL DIA, MES Y AÑO.
eventos['mes'] = eventos['timestamp'].dt.month
eventos['dia'] = eventos['timestamp'].dt.day
eventos['hora'] = eventos['timestamp'].dt.hour
## ARMAMOS UNA COLUMNA PARA EL DÍA DE LA SEMANA COMO NOMBRE.
eventos['diasemana'] = eventos['timestamp'].dt.weekday_name
## PONEMOS LOS NOMBRES DE MANERA MÁS PROLIJA PARA LOS GRÁFICOS.
eventos.loc[eventos.diasemana.str.contains('Monday', na=False), 'diasemana'] = 'lunes'
eventos.loc[eventos.diasemana.str.contains('Tuesday', na=False), 'diasemana'] = 'martes'
eventos.loc[eventos.diasemana.str.contains('Wednesday', na=False), 'diasemana'] = 'miercoles'
eventos.loc[eventos.diasemana.str.contains('Thursday', na=False), 'diasemana'] = 'jueves'
eventos.loc[eventos.diasemana.str.contains('Friday', na=False), 'diasemana'] = 'viernes'
eventos.loc[eventos.diasemana.str.contains('Saturday', na=False), 'diasemana'] = 'sabado'
eventos.loc[eventos.diasemana.str.contains('Sunday', na=False), 'diasemana'] = 'domingo'
# DEFINIMOS UNA LÓGICA PARA INDICAR SI EL DÍA EN QUE SE EJECUTA EL EVENTO ES FIN DE SEMANA.
eventos['finde'] = 0
eventos.loc[(eventos.diasemana.str.contains('DOM', na=False) | eventos.diasemana.str.contains('SAB', na=False)), 'finde'] = 1
# DEFINIMOS EL MES COMO NOMBRE PARA FACILITAR LAS COLUMNAS
eventos['mesMayus'] = ''
eventos.loc[eventos.mes == 1, 'mesMayus'] = 'enero'
eventos.loc[eventos.mes == 2, 'mesMayus'] = 'febrero'
eventos.loc[eventos.mes == 3, 'mesMayus'] = 'marzo'
eventos.loc[eventos.mes == 4, 'mesMayus'] = 'abril'
eventos.loc[eventos.mes == 5, 'mesMayus'] = 'mayo'
eventos.loc[eventos.mes == 6, 'mesMayus'] = 'junio'
eventos.loc[eventos.mes == 7, 'mesMayus'] = 'julio'
eventos.loc[eventos.mes == 8, 'mesMayus'] = 'agosto'
eventos.loc[eventos.mes == 9, 'mesMayus'] = 'septiembre'
eventos.loc[eventos.mes == 10, 'mesMayus'] = 'octubre'
eventos.loc[eventos.mes == 11, 'mesMayus'] = 'noviembre'
eventos.loc[eventos.mes == 12, 'mesMayus'] = 'diciembre'
# ARMAMOS UNA LÓGICA PARA SEGMENTAR LAS FRANJAS HORARIAS.
# MADRUGADA de 00 a 06
eventos['hora_madrugada'] = 0
eventos.loc[((eventos.hora > -1) & (eventos.hora < 7)), 'hora_madrugada'] = 1
# MAÑANA de 07 a 11
eventos['hora_mañana'] = 0
eventos.loc[((eventos.hora > 6) & (eventos.hora < 12)), 'hora_mañana'] = 1
# ALMUERZO de 12 a 13
eventos['hora_almuerzo'] = 0
eventos.loc[((eventos.hora > 11) & (eventos.hora < 14)), 'hora_almuerzo'] = 1
# TARDE de 14 a 18
eventos['hora_tarde'] = 0
eventos.loc[((eventos.hora > 13) & (eventos.hora < 19)), 'hora_tarde'] = 1
# NOCHE de 19 a 23
eventos['hora_noche'] = 0
eventos.loc[((eventos.hora > 18) & (eventos.hora < 24)), 'hora_noche'] = 1
# TRANSFORMAMOS EN CATEGÓRICAS EL DÍA DE LA SEMANA Y EL MES.
eventos['diasemana'] = eventos['diasemana'].astype('category')
eventos['mesMayus'] = eventos['mesMayus'].astype('category')

In [6]:
# COMO TENEMOS UN EVENTO CON EL MISMO NOMBRE SE GENERA CONFLICTOS, 
# ERGO LE MODIFICAMOS EL NOMBRE PARA NO TENER DOS COLUMNAS CON = NOMBRE Y DISTINTO TIPO.
eventos.rename(columns={'staticpage': 'Genstatpage'}, inplace=True)

In [7]:
# COLUMNAS DONDE POR CADA REGISTRO SABEMOS QUE TENEMOS UN VALOR (SIEMPRE PRESENTES)
dummies = pd.get_dummies(eventos['diasemana'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['mesMayus'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['event'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)

In [8]:
## ESTAS COLUMNAS NO APARECEN SIEMPRE, PUESTO QUE SOLO APARECEN SEGÚN EL TIPO DE EVENTO.
dummies = pd.get_dummies(eventos['color'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['model'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['condition'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)
dummies = pd.get_dummies(eventos['storage'], drop_first=False)
eventos = pd.concat([eventos, dummies], axis=1)

In [9]:
dummies = ''
eventos.rename(columns={'generic listing': 'geneList', 'staticpage': 'statpage', 'staticpage': 'SP'}, inplace=True)

In [10]:
eventos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2341681 entries, 1507286 to 1504503
Columns: 341 entries, timestamp to 8GB
dtypes: category(18), datetime64[ns](1), float64(1), int64(9), object(5), uint8(307)
memory usage: 1.0+ GB


In [11]:
eventos_filtrados = ''
eventos_agrupados = ''
asd = ''

# VOY A TRABAJAR CON UN CASO  TESTIGO.
pd.options.display.max_columns = 350

#eventos = eventos[(eventos.person == 'db2c4d27')]
eventos_filtrados = eventos.iloc[:1000000, 0:150]
eventos_filtrados.head(3)

#eventos_filtrados.info()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,Genstatpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,mes,dia,hora,diasemana,finde,mesMayus,hora_madrugada,hora_mañana,hora_almuerzo,hora_tarde,hora_noche,domingo,jueves,lunes,martes,miercoles,sabado,viernes,abril,enero,febrero,marzo,mayo,ad campaign hit,brand listing,checkout,conversion,geneList,lead,search engine hit,searched products,SP,viewed product,visited site,Amarelo,Ametista,Azul,Azul Escuro,Azul Safira,Azul Topázio,Bambu,Black Piano,Branco,Branco Azul,Branco Azul Navy,Branco Bambu,Branco Cabernet,Branco Dourado,Branco Framboesa,Branco Pink,Branco Verde,Branco Vermelho,Cabernet,Cinza,Cinza espacial,Cobre,Coral,Couro Marrom,Couro Navy,Couro Vinho,Couro Vintage,Cromo,Dourado,Framboesa,Indigo,Iuna,Olympic Edition,Ouro,Ouro Rosa,Platinum,Prata,Prateado,Preto,Preto Asfalto,Preto Azul,Preto Azul Navy,Preto Bambu,Preto Branco,Preto Brilhante,Preto Cabernet,Preto Matte,Preto Pink,Preto Tabaco,Preto Verde,Preto Vermelho,Rosa,Rose,Rouge,Roxo,Silver,Titânio,Turquesa,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix,Asus Live,Asus Zenfone 2,Asus Zenfone 2 Deluxe,Asus Zenfone 2 Laser,"Asus Zenfone 2 Laser 6""",Asus Zenfone 3 Max 32 GB,Asus Zenfone 3 Max 16 GB,Asus Zenfone 5,Asus Zenfone 6,Asus Zenfone Go,Asus Zenfone Selfie,LG X Screen,LG G2 Mini D618,LG G3 Beat D724,LG G3 D855,LG G3 Stylus D690,LG G4 Beat H736,LG G4 H815P,LG G4 H818P,LG G4 Stylus H630,LG G4 Stylus HDTV H540T,LG G5 SE,LG K10,LG K10 Novo,LG K10 TV,LG K4,LG K8,LG L Prime D337,LG L80 Dual,LG Nexus 4
1507286,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,,,,,,,,,,,,,,5,17,12,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2336760,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,,,,,Referral,New,Unknown,Unknown,Brazil,Computer,1920x1080,Windows 10,Chrome 66.0,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1507716,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,,,,,,,,,,,,,,5,17,13,jueves,0,mayo,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
columnas_filtrar = list(eventos_filtrados.select_dtypes(include=['int','float64','uint8']).columns)

eventos_agrupados = ''
asd = ''

columnas_filtrar.remove('sku')
columnas_filtrar.append('person')
asd = eventos_filtrados.loc[:, eventos_filtrados.columns.isin(columnas_filtrar)]
columnas_filtrar.remove('person')
eventos_filtrados = ''
#asd.head(5)
#asd.info()

eventos_agrupados = asd.groupby('person')[columnas_filtrar].mean().astype(np.float16).reset_index()
asd = ''
#eventos_agrupados = eventos_agrupados[(eventos_agrupados.dia > 0)]
#eventos_agrupados.head(15)

#asd.head()
#grouped_df = eventos.groupby('person').mean().reset_index()
#entrenar = pd.](grouped_df)

### =======================================================================
### ENTRENAMIENTO Y PREDICCIÓN.
### =======================================================================

In [13]:
## OBTENEMOS TODA LA INFORMACIÓN DEL SET DE ENTRENAMIENTO.
training = '' 
train_completo = ''
training = pd.read_csv('labels_training_set.csv', encoding = 'utf-8')

In [14]:
#eventos_agrupados.head()
#training.head()
# PROBANDO CON DATOS PONGO 'inner' EN LUGAR DE 'left'
train_completo = pd.merge(eventos_agrupados, training, on='person', how='inner')
train_completo.info()

#training = ''

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19414 entries, 0 to 19413
Columns: 118 entries, person to label
dtypes: float16(116), int64(1), object(1)
memory usage: 4.7+ MB


In [15]:
y = pd.factorize(train_completo['label'])[0]
#columnas_filtrar.remove('person')
#train_completo = train_completo[columnas_filtrar]

#columnas_filtrar.append('person')
train_completo = train_completo.loc[:, train_completo.columns.isin(columnas_filtrar)]
# VER COMO OBTENER LAS COLUMNAS ESPECÍFICAS QUE QUEREMOS TRABAJAR.
features = train_completo.columns
#pd.to_numeric(train_completo, downcast='float')
train_completo.astype(np.float16)
train_completo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19414 entries, 0 to 19413
Columns: 116 entries, domingo to LG Nexus 4
dtypes: float16(116)
memory usage: 4.4 MB


In [20]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
# Creamos un clasificador con Random Forest..
clf = RandomForestClassifier(n_jobs=2, random_state=0)
# Entrenamos.train_completo[features].astype(float32)
clf.fit(train_completo[features], y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
# Predecimos.
eventos_agrupados['label'] = 0
clf.predict(eventos_agrupados[features])

In [None]:
# REVISAR. todos dan iguales.
prueba = clf.predict_proba(eventos_agrupados[features])[0:10000]

In [None]:
y = np.unique(prueba)

In [None]:
y

In [None]:
prueba