# **Importamos librerías de interés**

In [None]:
import random

from joblib import dump, load

import numpy as np

import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from sklearn import svm

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

import keras

import tensorflow as tf

import matplotlib.pyplot as plt

from scipy import interpolate

from google.colab import drive

# **Funciones de trabajo**

In [None]:
# Definimos las funciones de transformacion

#_______________________________________________________________________________
def fun_action_time(df_data):
  """Return the ``action_time`` column of ``df_data`` as a column vector.

  Args:
    df_data: DataFrame that contains an ``action_time`` column.

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  column = df_data['action_time']

  # reshape(-1, 1) turns the flat values into an (n_rows, 1) array.
  return column.values.reshape(-1, 1)

#_______________________________________________________________________________
def fun_delay_time(df_data):
  """Return, per row, ``down_time`` minus the ``up_time`` of the previous row.

  The first row has no predecessor, so 0 is used as its previous ``up_time``.

  Args:
    df_data: DataFrame with ``down_time`` and ``up_time`` columns.

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  # Move every up_time one row down; the vacated first slot is filled with 0.
  previous_up_time = df_data['up_time'].shift(periods=1, fill_value=0)

  # Work on the raw values so the subtraction is positional, not index-aligned.
  gap = df_data['down_time'].values - previous_up_time.values

  return gap.reshape(-1, 1)

#_______________________________________________________________________________
def fun_activity(df_data, aux_dict):
  """Encode the ``activity`` column with the weights stored in ``aux_dict``.

  Each activity label is cut to its first three characters, converted to an
  integer index by a ``StringLookup`` layer built from the stored vocabulary,
  and that index is then replaced by its value in the stored dictionary.

  Args:
    df_data: DataFrame with an ``activity`` column of strings.
    aux_dict: dictionary whose ``'map_dict_activity'`` entry holds
      ``'vocab'`` (lookup vocabulary) and ``'map_dict'`` (index -> value).

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  settings = aux_dict['map_dict_activity']
  value_by_index = settings['map_dict']

  lookup = tf.keras.layers.StringLookup(
      vocabulary=settings['vocab'],
      num_oov_indices=1,
      oov_token='[UNK]',  # unknown labels are encoded as index 0
      output_mode='int')

  # Only the first three characters of the label are used.
  prefixes = df_data['activity'].apply(lambda label: label[:3])

  indices = pd.Series(lookup(prefixes))

  encoded = indices.apply(lambda idx: value_by_index[idx])

  return encoded.values.reshape(-1, 1)

#_______________________________________________________________________________

# funcion auxiliar 1
def mod_act(str_arr):
  """Return a character count for a ``text_change`` string.

  * ``'NoChange'`` counts as 0.
  * A string containing ``' => '`` is split at the first occurrence and the
    lengths of both sides are added (the separator is not counted).
  * Any other string counts as its own length.
  """
  if str_arr == 'NoChange':
    return 0

  if ' => ' not in str_arr:
    return len(str_arr)

  # partition splits at the first ' => ' only.
  before, _, after = str_arr.partition(' => ')
  return len(before) + len(after)

# funcion auxiliar 2
def cut_act(len_act):
  """Cap ``len_act`` at 2: values of 2 or more become 2, others are returned as is."""
  return 2 if len_act >= 2 else len_act

def fun_mod_activity(df_data, aux_dict):
  """Encode the size of each ``text_change`` with the weights in ``aux_dict``.

  Every ``text_change`` string is reduced to a capped character count
  (``mod_act`` followed by ``cut_act``), converted to an integer index by an
  ``IntegerLookup`` layer built from the stored vocabulary, and that index is
  replaced by its value in the stored dictionary.

  Args:
    df_data: DataFrame with a ``text_change`` column of strings.
    aux_dict: dictionary whose ``'map_dict_mod_activity'`` entry holds
      ``'vocab'`` (lookup vocabulary) and ``'map_dict'`` (index -> value).

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  settings = aux_dict['map_dict_mod_activity']
  value_by_index = settings['map_dict']

  lookup = tf.keras.layers.IntegerLookup(
      vocabulary=settings['vocab'],
      num_oov_indices=1,  # values outside the vocabulary are encoded as index 0
      output_mode='int',
      vocabulary_dtype='int64')

  capped_length = df_data['text_change'].apply(mod_act).apply(cut_act)

  indices = pd.Series(lookup(capped_length))

  encoded = indices.apply(lambda idx: value_by_index[idx])

  return encoded.values.reshape(-1, 1)

#_______________________________________________________________________________

def fun_event(df_data, aux_dict):
  """Encode the ``up_event`` column with the weights stored in ``aux_dict``.

  Each event is converted to an integer index by a ``StringLookup`` layer
  built from the stored vocabulary, and that index is replaced by its value
  in the stored dictionary.

  Args:
    df_data: DataFrame with an ``up_event`` column of strings.
    aux_dict: dictionary whose ``'map_dict_event'`` entry holds ``'vocab'``
      (lookup vocabulary) and ``'map_dict'`` (index -> value).

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  settings = aux_dict['map_dict_event']
  value_by_index = settings['map_dict']

  lookup = tf.keras.layers.StringLookup(
      vocabulary=settings['vocab'],
      num_oov_indices=1,
      output_mode='int')

  indices = pd.Series(lookup(df_data['up_event']))

  encoded = indices.apply(lambda idx: value_by_index[idx])

  return encoded.values.reshape(-1, 1)

#_______________________________________________________________________________

def fun_cursor_position(df_data):
  """Return the cursor position with its linear trend removed.

  A linear SVR (inputs standardised first) is fitted to ``cursor_position``
  as a function of the row number, and the result is the fitted value minus
  the observed value for every row.

  Args:
    df_data: DataFrame with a ``cursor_position`` column.

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  observed = df_data['cursor_position'].values

  # Row number 0..n-1, shaped (n, 1) as scikit-learn expects for features.
  row_number = np.arange(len(observed)).reshape(-1, 1)

  trend_model = make_pipeline(
      StandardScaler(),
      svm.SVR(kernel='linear', C=100, epsilon=10e-21))

  trend_model.fit(row_number, observed)

  # Sign convention: fitted trend minus observation.
  residual = trend_model.predict(row_number) - observed

  return residual.reshape(-1, 1)

#_______________________________________________________________________________

def fun_word_count(df_data):
  """Return a 20-knot step approximation of the de-trended word count.

  A linear SVR (inputs standardised first) is fitted to ``word_count`` as a
  function of the row number.  The residual (fitted value minus observed
  value) is then sampled at 20 evenly spaced row numbers and every row takes
  the residual of its nearest sampled row (``'nearest-up'`` interpolation).

  Args:
    df_data: DataFrame with a ``word_count`` column and at least one row.

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  word_count = df_data['word_count'].values

  row_number = np.arange(len(word_count))

  # scikit-learn expects the feature as an (n, 1) matrix.
  x_word_count = row_number.reshape(-1, 1)

  regr_wc = make_pipeline(
      StandardScaler(),
      svm.SVR(kernel='linear', C=100, epsilon=10e-21))

  regr_wc.fit(x_word_count, word_count)

  # Sign convention: fitted trend minus observation.
  residual = regr_wc.predict(x_word_count) - word_count

  # 20 evenly spaced row numbers between the first and the last row.  Because
  # the x axis is the row number itself, these are both the positions to read
  # the residual from and the x coordinates of the interpolation knots.
  knots = np.floor(np.linspace(start=row_number[0],
                               stop=row_number[-1],
                               num=20)).astype(int)

  f = interpolate.interp1d(knots, residual[knots], 'nearest-up')

  return f(row_number).reshape(-1, 1)


#_______________________________________________________________________________


# funcion auxiliar

def text_change_format(row):
  """Collapse any string containing ``' => '`` to ``'=>'``; return others unchanged."""
  return '=>' if ' => ' in row else row

def fun_text_change(df_data, aux_dict):
  """Encode the ``text_change`` column with the weights stored in ``aux_dict``.

  Each string is first normalised with ``text_change_format``, converted to
  an integer index by a ``StringLookup`` layer built from the stored
  vocabulary, and that index is replaced by its value in the stored
  dictionary.

  Args:
    df_data: DataFrame with a ``text_change`` column of strings.
    aux_dict: dictionary whose ``'map_dict_tc'`` entry holds ``'vocab'``
      (lookup vocabulary) and ``'map_dict'`` (index -> value).

  Returns:
    2-D numpy array with a single column, one row per row of ``df_data``.
  """
  settings = aux_dict['map_dict_tc']
  value_by_index = settings['map_dict']

  lookup = tf.keras.layers.StringLookup(
      vocabulary=settings['vocab'],
      num_oov_indices=1,
      output_mode='int')

  normalised = df_data['text_change'].apply(text_change_format)

  indices = pd.Series(lookup(normalised))

  encoded = indices.apply(lambda idx: value_by_index[idx])

  return encoded.values.reshape(-1, 1)

#_______________________________________________________________________________

def fun_len_time(df_data):
  """Return the largest ``up_time``/``down_time`` value divided by 8313000.

  Args:
    df_data: DataFrame with ``up_time`` and ``down_time`` columns.

  Returns:
    Scalar ratio between the largest timestamp in ``df_data`` and the fixed
    reference value.
  """
  reference_time = 8313000

  last_up = df_data['up_time'].max()
  last_down = df_data['down_time'].max()

  return np.maximum(last_up, last_down) / reference_time



#_______________________________________________________________________________

def fun_len_cursor_position(df_data):
  """Return the largest ``cursor_position`` value divided by 7800.

  Args:
    df_data: DataFrame with a ``cursor_position`` column.

  Returns:
    Scalar ratio between the column maximum and the fixed reference value.
  """
  reference_position = 7800

  return df_data['cursor_position'].max() / reference_position



#_______________________________________________________________________________

def fun_len_word(df_data):
  """Return the largest ``word_count`` value divided by 1330.

  Args:
    df_data: DataFrame with a ``word_count`` column.

  Returns:
    Scalar ratio between the column maximum and the fixed reference value.
  """
  reference_word_count = 1330

  return df_data['word_count'].max() / reference_word_count


In [None]:

#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

def _frequency_weight_map(encoded, vocab_size):
  """Build the ``lookup index -> weight`` dictionary for one categorical column.

  The relative frequency ``p`` of every index in ``encoded`` is computed;
  indices are taken from most to least frequent (``value_counts`` order) and
  each one receives the running sum of ``log(1/p)``.

  Args:
    encoded: integer indices returned by a Keras lookup layer.
    vocab_size: number of entries in that layer's vocabulary; the layer can
      emit the indices ``0 .. vocab_size`` (0 is the out-of-vocabulary index).

  Returns:
    Dictionary with one entry per index in ``0 .. vocab_size``.
  """
  frequencies = pd.Series(encoded).value_counts(normalize=True)

  weights = frequencies.apply(lambda x: np.log(1/x)).cumsum()

  weight_map = dict(zip(frequencies.index.values, weights))

  # Indices that never occurred in the training data (always the OOV index 0
  # when the vocabulary was built from the same column) get twice the largest
  # observed weight, so later lookups cannot fail with a KeyError.
  fallback = 2 * max(weight_map.values())

  for idx in range(vocab_size + 1):
    if idx not in weight_map:
      weight_map[idx] = fallback

  return weight_map


def train_data_processing( df_logs, df_scores, id_list):
  """Build the training arrays and the auxiliary objects needed to encode new data.

  Args:
    df_logs: DataFrame of events; the columns ``id``, ``event_id``,
      ``activity``, ``text_change``, ``down_event``, ``up_event`` and those
      read by the ``fun_*`` feature functions are used.
    df_scores: DataFrame with ``id`` and ``score`` columns.
    id_list: ids to process; the first axis of every returned array follows
      the order of this list.

  Returns:
    Tuple ``(dyn_train_data, stat_train_data, target_train_data, aux_dict)``:

    * ``dyn_train_data``: array of shape (n_ids, max_len, 8); each of the 8
      features is min-max scaled to [0, 5] and sequences are post-padded
      with -1.
    * ``stat_train_data``: array of shape (n_ids, 3).
    * ``target_train_data``: column vector with the scores, in the same id
      order as the two arrays above (one row per id, assuming every id has
      exactly one row in ``df_scores``).
    * ``aux_dict``: vocabularies and weight dictionaries
      (``map_dict_activity``, ``map_dict_mod_activity``, ``map_dict_event``,
      ``map_dict_tc``), the fitted per-feature scalers (``list_scaler``) and
      the padding length (``max_len``).
  """
  n_dyn_features = 8  # number of columns stacked into 'dyn_data' below

  df_train=df_logs[df_logs['id'].isin(id_list)]

  df_target=df_scores[df_scores['id'].isin(id_list)]

  aux_dict={}

#-------------------------------------------------------------------------------
  # activity: vocabulary of 3-character prefixes, most frequent first

  activity_cut=df_train['activity'].apply(lambda x: x[:3])

  vocab_activity=list(activity_cut.value_counts(normalize=True).index.values)

  layer_activity= tf.keras.layers.StringLookup(
                                      vocabulary=vocab_activity,
                                      num_oov_indices=1,
                                      oov_token='[UNK]', # encoded as index 0
                                      output_mode='int')

  aux_dict['map_dict_activity']={
      'map_dict':_frequency_weight_map(layer_activity(activity_cut),
                                       len(vocab_activity)),
      'vocab':vocab_activity}

#-------------------------------------------------------------------------------
  # mod activity: capped number of characters changed

  mod_activity=df_train['text_change'].apply(mod_act).apply(cut_act)

  vocab_mod_activity=list(
                      mod_activity.value_counts(normalize=True).index.values)

  layer_mod_activity=tf.keras.layers.IntegerLookup(
                                  vocabulary=vocab_mod_activity,
                                  num_oov_indices=1, # encoded as index 0
                                  output_mode='int',
                                  vocabulary_dtype='int64')

  # The fallback entry is written into this column's own dictionary (it was
  # previously written into the activity dictionary by mistake).
  aux_dict['map_dict_mod_activity']={
      'map_dict':_frequency_weight_map(layer_mod_activity(mod_activity),
                                       len(vocab_mod_activity)),
      'vocab':vocab_mod_activity}

#-------------------------------------------------------------------------------
  # event: vocabulary from down and up events together, 15 most frequent

  df_event=pd.concat(objs=[df_train['down_event'],
                           df_train['up_event']
                                                    ],axis=0,ignore_index=True)

  vocab_event=list(df_event.value_counts(normalize=True).index.values)[:15]

  layer_event= tf.keras.layers.StringLookup(vocabulary=vocab_event,
                                      num_oov_indices=1,
                                      output_mode='int')

  # The weights themselves are computed from the up events only.
  aux_dict['map_dict_event']={
      'map_dict':_frequency_weight_map(layer_event(df_train['up_event']),
                                       len(vocab_event)),
      'vocab':vocab_event}

#-------------------------------------------------------------------------------
  # text change: 12 most frequent normalised values

  tc_transf=df_train['text_change'].apply(text_change_format)

  vocab_tc=list(tc_transf.value_counts(normalize=True).index.values)[:12]

  layer_tc= tf.keras.layers.StringLookup(vocabulary=vocab_tc,
                                            num_oov_indices=1,
                                                  output_mode='int')

  aux_dict['map_dict_tc']={
      'map_dict':_frequency_weight_map(layer_tc(tc_transf), len(vocab_tc)),
      'vocab':vocab_tc }

#-------------------------------------------------------------------------------
  # build the per-id data dictionary

  dict_train={}

  for sample_id in id_list:

    df_instance=df_train[df_train['id']==sample_id]

    # one row per event, one column per dynamic feature
    dyn_data=np.hstack(
                      tup=(
    fun_action_time(df_instance),
    fun_delay_time(df_instance),
    fun_activity(df_instance, aux_dict),
    fun_mod_activity(df_instance, aux_dict),
    fun_event(df_instance, aux_dict),
    fun_cursor_position(df_instance),
    fun_word_count(df_instance),
    fun_text_change(df_instance, aux_dict)
                                    )
                                                  )

    # three scalars per id
    stat_data=np.hstack(
                          tup=(
    fun_len_time(df_instance),
    fun_len_cursor_position(df_instance),
    fun_len_word(df_instance)
                                        )
                              )

    target=df_target[df_target['id']==sample_id]['score'].values

    dict_train[sample_id]={
        'dyn_data':dyn_data,
        'stat_data':stat_data,
        'target':target
                                }

#-------------------------------------------------------------------------------
  # scaling: one MinMaxScaler per dynamic feature, fitted on the values of
  # all ids together

  list_scaler=[]

  for k in range(n_dyn_features):

    all_values=np.concatenate(
                  [entry['dyn_data'][:,k] for entry in dict_train.values()])

    list_scaler.append(
                      MinMaxScaler(feature_range=(0, 5))
                      .fit(all_values.reshape(-1,1))
                                                              )

  # Keep the fitted scalers so other data can be scaled exactly like the
  # training data.
  aux_dict['list_scaler']=list_scaler

  for entry in dict_train.values():

    dyn_data=entry['dyn_data']

    entry['dyn_data']=np.hstack(
                  [ list_scaler[k].transform(dyn_data[:,k].reshape(-1,1))
                                            for k in range(n_dyn_features) ]
                                                  )

#-------------------------------------------------------------------------------
  # padding: common length is the largest event_id plus 5 %

  max_len=int(df_train['event_id'].max()*1.05)

  aux_dict['max_len']=max_len

  # Pad feature by feature: each call returns (n_ids, max_len); a trailing
  # axis is added so the features can be joined into (n_ids, max_len, 8).
  pad_list_data=[tf.keras.utils.pad_sequences(
                    sequences=[entry['dyn_data'][:,k]
                                          for entry in dict_train.values()],
                    maxlen=max_len,
                    dtype='float64',
                    padding='post',
                    truncating='pre',
                    value=-1
                                  )[:,:,np.newaxis]
                                      for k in range(n_dyn_features)
                                                              ]

  dyn_train_data=np.concatenate(pad_list_data,axis=2)

  # static data: one row per id

  stat_train_data=np.vstack(
              [entry['stat_data'] for entry in dict_train.values()])

  # targets: taken per id from dict_train so that row i belongs to the same
  # id as row i of the feature arrays (df_target keeps the row order of
  # df_scores, which need not match the order of id_list)

  target_train_data=np.concatenate(
              [entry['target'] for entry in dict_train.values()]
                                                        ).reshape(-1,1)

  return  dyn_train_data, stat_train_data, target_train_data, aux_dict


In [None]:
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

def test_data_processing( df_logs, df_scores, id_list,aux_dict):
  """Build the evaluation arrays using the auxiliary objects in ``aux_dict``.

  Args:
    df_logs: DataFrame of events; the ``id`` column and the columns read by
      the ``fun_*`` feature functions are used.
    df_scores: DataFrame with ``id`` and ``score`` columns.
    id_list: ids to process; the first axis of every returned array follows
      the order of this list.
    aux_dict: dictionary with the vocabularies and weight dictionaries read
      by the ``fun_*`` functions and the padding length ``'max_len'``.  When
      it also contains ``'list_scaler'`` (a list of 8 fitted scalers), those
      scalers are applied, so scaled values can fall outside [0, 5] for data
      outside the range the scalers were fitted on.  When the key is absent
      the scalers are fitted on the data being processed.

  Returns:
    Tuple ``(dyn_test_data, stat_test_data, target_test_data)``:

    * ``dyn_test_data``: array of shape (n_ids, max_len, 8), post-padded
      with -1.
    * ``stat_test_data``: array of shape (n_ids, 3).
    * ``target_test_data``: column vector with the scores, in the same id
      order as the two arrays above (one row per id, assuming every id has
      exactly one row in ``df_scores``).
  """
  n_dyn_features = 8  # number of columns stacked into 'dyn_data' below

  df_test=df_logs[df_logs['id'].isin(id_list)]

  df_target=df_scores[df_scores['id'].isin(id_list)]

#-------------------------------------------------------------------------------
  # build the per-id data dictionary

  dict_test={}

  for sample_id in id_list:

    df_instance=df_test[df_test['id']==sample_id]

    # one row per event, one column per dynamic feature
    dyn_data=np.hstack(
                      tup=(
    fun_action_time(df_instance),
    fun_delay_time(df_instance),
    fun_activity(df_instance, aux_dict),
    fun_mod_activity(df_instance, aux_dict),
    fun_event(df_instance, aux_dict),
    fun_cursor_position(df_instance),
    fun_word_count(df_instance),
    fun_text_change(df_instance, aux_dict)
                                    )
                                                  )

    # three scalars per id
    stat_data=np.hstack(
                          tup=(
    fun_len_time(df_instance),
    fun_len_cursor_position(df_instance),
    fun_len_word(df_instance)
                                        )
                              )

    target=df_target[df_target['id']==sample_id]['score'].values

    dict_test[sample_id]={
        'dyn_data':dyn_data,
        'stat_data':stat_data,
        'target':target
                                }

#-------------------------------------------------------------------------------
  # scaling: reuse the stored scalers when available so this data is
  # transformed exactly like the data the scalers were fitted on

  list_scaler=aux_dict.get('list_scaler')

  if list_scaler is None:

    # No stored scalers: fit one MinMaxScaler per feature on this data.
    list_scaler=[]

    for k in range(n_dyn_features):

      all_values=np.concatenate(
                    [entry['dyn_data'][:,k] for entry in dict_test.values()])

      list_scaler.append(
                        MinMaxScaler(feature_range=(0, 5))
                        .fit(all_values.reshape(-1,1))
                                                                )

  for entry in dict_test.values():

    dyn_data=entry['dyn_data']

    entry['dyn_data']=np.hstack(
                  [ list_scaler[k].transform(dyn_data[:,k].reshape(-1,1))
                                            for k in range(n_dyn_features) ]
                                                  )

#-------------------------------------------------------------------------------
  # padding: use the sequence length stored in aux_dict

  max_len=aux_dict['max_len']

  # Pad feature by feature: each call returns (n_ids, max_len); a trailing
  # axis is added so the features can be joined into (n_ids, max_len, 8).
  pad_list_data=[tf.keras.utils.pad_sequences(
                    sequences=[entry['dyn_data'][:,k]
                                          for entry in dict_test.values()],
                    maxlen=max_len,
                    dtype='float64',
                    padding='post',
                    truncating='pre',
                    value=-1
                                  )[:,:,np.newaxis]
                                      for k in range(n_dyn_features)
                                                              ]

  dyn_test_data=np.concatenate(pad_list_data,axis=2)

  # static data: one row per id

  stat_test_data=np.vstack(
              [entry['stat_data'] for entry in dict_test.values()])

  # targets: taken per id from dict_test so that row i belongs to the same
  # id as row i of the feature arrays (df_target keeps the row order of
  # df_scores, which need not match the order of id_list)

  target_test_data=np.concatenate(
              [entry['target'] for entry in dict_test.values()]
                                                        ).reshape(-1,1)

  return  dyn_test_data, stat_test_data, target_test_data

# **Importamos la data**

In [None]:
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Directory that holds the input CSV files (and receives the outputs).
dir_data_kaggle='./drive/MyDrive/lwpwq/data'

# Read the two input tables.
df_logs=pd.read_csv(dir_data_kaggle+'/train_logs.csv')

df_scores=pd.read_csv(dir_data_kaggle+'/train_scores.csv')

Mounted at /content/drive


# **Definimos el split**

In [None]:
# Unique ids, in order of first appearance in the logs.
sample_id=list(df_logs['id'].unique())

# 90 % of the ids (rounded down) are used for training.
train_size=int(len(sample_id)*0.9)

# Random draw without replacement; no seed is set, so the split changes
# between runs.
id_list_train= random.sample(sample_id, k=train_size)

# A set makes each membership test O(1) instead of scanning the whole list.
train_id_set=set(id_list_train)

# The remaining ids, kept in their original order.
complement_sample_id=[id_ for id_ in sample_id if id_ not in train_id_set]

id_list_test= complement_sample_id

# **Pre procesamos la data**

In [None]:


# Training arrays plus the auxiliary dictionary built while processing them.
(dyn_train_data,
 stat_train_data,
 target_train_data,
 aux_dict) = train_data_processing(df_logs=df_logs,
                                   df_scores=df_scores,
                                   id_list=id_list_train)

# Held-out arrays, encoded with the auxiliary dictionary from above.
(dyn_test_data,
 stat_test_data,
 target_test_data) = test_data_processing(df_logs=df_logs,
                                          df_scores=df_scores,
                                          id_list=id_list_test,
                                          aux_dict=aux_dict)

# **Exportamos la data**

In [None]:
# Write every array next to the input data, in the same order as before.
for file_name, array in (
    ('/dyn_train_data_prod.npy', dyn_train_data),
    ('/dyn_test_data_prod.npy', dyn_test_data),
    ('/stat_train_data_prod.npy', stat_train_data),
    ('/stat_test_data_prod.npy', stat_test_data),
    ('/target_train_data_prod.npy', target_train_data),
    ('/target_test_data_prod.npy', target_test_data),
):
  np.save(dir_data_kaggle+file_name, array)

# Persist the auxiliary dictionary with joblib.
dump(aux_dict, dir_data_kaggle+'/prod_dict.joblib')

['./drive/MyDrive/lwpwq/data/prod_dict.joblib']