---

## 6. Integración de texto

In [None]:
import sys
import os
project_path = os.path.abspath('..')
sys.path.insert(1, project_path)

import time
import math
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split

from src.features.preprocessing import preprocessing, custom_features, boc_some_values, boc_many_values

### Idea principal

- Tener un pipeline básico para cada una de las tareas
- Dejar pre-procesamiento
- Compatibilidad con output de modelo de lenguaje
- Elegir mejor manera de incluir modelo de lenguaje

In [6]:
pipe = Pipeline([
    ('Pre-procesamiento',preprocessing),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])

In [39]:
df_train = pd.read_pickle('train.pickle')
df_train = custom_features(df_train)
X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['rating'], test_size=0.3, random_state=0, stratify=df_train['rating'])

In [8]:
def timeSince(since):
    now = time.time_ns()
    s = now - since
    return s*10**(-9)

In [9]:
print("Resultados clasificación MLP")
start = time.time_ns()
pipe.fit(X_train, y_train)

print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe.predict(X_eval)

print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))
print(classification_report(y_eval,y_pred))

Resultados clasificación MLP
Time elapsed for training: 6.620332166000001 seconds

Time elapsed for inference (eval set): 0.043536191 seconds

                 precision    recall  f1-score   support

          Mixed       0.31      0.28      0.29       497
Mostly Positive       0.28      0.29      0.28       512
       Negative       0.46      0.33      0.38       387
       Positive       0.33      0.42      0.37       610
  Very Positive       0.41      0.36      0.38       359

       accuracy                           0.34      2365
      macro avg       0.36      0.34      0.34      2365
   weighted avg       0.35      0.34      0.34      2365



### Agregando embeddings

In [38]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

MODEL = "distilbert-videogame-descriptions-rating"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """retorna el SequenceClassifierOutput"""
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

#### Versión logits

In [41]:
def logits_embedding(clf_output):
    # retorna el vector de scores de clasificacion (antes de la capa softmax)
    return clf_output['logits'][0].detach().numpy().reshape(1,5)

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class LogitsEmbedding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embed = lambda row: logits_embedding(sentence_clf_output(row))
        X_new = X.apply(embed)
        X_new = np.concatenate(X_new.values)
        return X_new

In [13]:
preprocesisng_logits = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('LogitsText',LogitsEmbedding(),'short_description')
])

pipe_logits = Pipeline([
    ('Pre-procesamiento',preprocesisng_logits),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])

In [14]:
print("Resultados clasificación MLP con logit embeddings\n")

start = time.time_ns()
pipe_logits.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_logits.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

print(classification_report(y_eval,y_pred))

Resultados clasificación MLP con logit embeddings

Time elapsed for training: 131.765849573 seconds

Time elapsed for inference (eval set): 51.816425677000005 seconds

                 precision    recall  f1-score   support

          Mixed       0.27      0.37      0.31       497
Mostly Positive       0.26      0.21      0.23       512
       Negative       0.45      0.35      0.39       387
       Positive       0.35      0.41      0.38       610
  Very Positive       0.42      0.32      0.36       359

       accuracy                           0.33      2365
      macro avg       0.35      0.33      0.33      2365
   weighted avg       0.34      0.33      0.33      2365



#### Versión token [CLF]

In [15]:
def first_tok_embedding(cfl_output):
    # retorna un numpy array correspondiente al token contextualizado
    return cfl_output['hidden_states'][-1][0][0].detach().numpy().reshape(1,768)

class CLFTokenEmbedding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embed = lambda row: first_tok_embedding(sentence_clf_output(row))
        X_new = X.apply(embed)
        X_new = np.concatenate(X_new.values)
        return X_new

In [16]:
preprocesisng_CLFToken = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('LogitsText',CLFTokenEmbedding(),'short_description')
])

pipe_CLFToken = Pipeline([
    ('Pre-procesamiento',preprocesisng_CLFToken),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])

In [17]:
print("Resultados clasificación MLP con CLFToken embeddings\n")

start = time.time_ns()
pipe_CLFToken.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_CLFToken.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

Resultados clasificación MLP con CLFToken embeddings

Time elapsed for training: 141.993939402 seconds

Time elapsed for inference (eval set): 56.33335626 seconds



3 min approx entre fit y predict

In [18]:
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.29      0.31      0.30       497
Mostly Positive       0.23      0.08      0.12       512
       Negative       0.36      0.37      0.37       387
       Positive       0.32      0.52      0.40       610
  Very Positive       0.38      0.27      0.32       359

       accuracy                           0.32      2365
      macro avg       0.32      0.31      0.30      2365
   weighted avg       0.31      0.32      0.30      2365



#### Versión promedio de embeddings

In [19]:
def mean_embedding(cfl_output):
    # retorna un numpy array correspondiente a la suma de los vectores contextualizados
    return cfl_output['hidden_states'][-1][0].detach().numpy().mean(axis=0).reshape(1,768)

class MeanEmbedding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embed = lambda row: mean_embedding(sentence_clf_output(row))
        X_new = X.apply(embed)
        X_new = np.concatenate(X_new.values)
        return X_new

In [20]:
preprocesisng_mean = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('LogitsText',MeanEmbedding(),'short_description')
])

pipe_mean = Pipeline([
    ('Pre-procesamiento',preprocesisng_mean),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])

In [21]:
print("Resultados clasificación MLP con mean embeddings\n")

start = time.time_ns()
pipe_mean.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_mean.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

Resultados clasificación MLP con mean embeddings

Time elapsed for training: 139.731334318 seconds

Time elapsed for inference (eval set): 59.623988174000004 seconds



In [22]:
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.27      0.37      0.31       497
Mostly Positive       0.20      0.08      0.11       512
       Negative       0.34      0.39      0.36       387
       Positive       0.34      0.47      0.39       610
  Very Positive       0.39      0.21      0.27       359

       accuracy                           0.31      2365
      macro avg       0.31      0.30      0.29      2365
   weighted avg       0.30      0.31      0.29      2365



#### Bag-of-words clásicos

In [23]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize 

stop_words = stopwords.words('english')

# Definimos un tokenizador con Stemming
class StemmerTokenizer:
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        doc_tok = word_tokenize(doc)
        doc_tok = [t for t in doc_tok if t not in stop_words]
        return [self.ps.stem(t) for t in doc_tok]

bow = CountVectorizer(
    tokenizer= StemmerTokenizer(),
    ngram_range=(1,2),
    min_df = 0.05, max_df = 0.85
    )

In [43]:
preprocesisng_bow = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('BoWText',bow,'short_description')
])

pipe_bow = Pipeline([
    ('Pre-procesamiento',preprocesisng_bow),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])

In [44]:
print("Resultados clasificación MLP con bag-of-words\n")

start = time.time_ns()
pipe_bow.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_bow.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

Resultados clasificación MLP con bag-of-words

Time elapsed for training: 8.205354385 seconds

Time elapsed for inference (eval set): 1.5025425110000001 seconds



In [45]:
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.31      0.24      0.27       497
Mostly Positive       0.29      0.17      0.21       512
       Negative       0.43      0.41      0.42       387
       Positive       0.32      0.52      0.39       610
  Very Positive       0.43      0.35      0.38       359

       accuracy                           0.34      2365
      macro avg       0.35      0.34      0.34      2365
   weighted avg       0.34      0.34      0.33      2365



#### BoW + logits

In [42]:
preprocesisng_logits_bow = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('LogitsText',LogitsEmbedding(),'short_description')
])

pipe_logits_bow = Pipeline([
    ('Pre-procesamiento',preprocesisng_logits_bow),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Clasificador',MLPClassifier(early_stopping =True,max_iter = 100, random_state=0))
])


print("Resultados clasificación MLP con logit embeddings + BoW\n")

start = time.time_ns()
pipe_logits_bow.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_logits_bow.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

print(classification_report(y_eval,y_pred))

Resultados clasificación MLP con logit embeddings + BoW

Time elapsed for training: 136.28623183300002 seconds

Time elapsed for inference (eval set): 53.621551404 seconds

                 precision    recall  f1-score   support

          Mixed       0.28      0.30      0.29       497
Mostly Positive       0.26      0.17      0.21       512
       Negative       0.38      0.48      0.42       387
       Positive       0.35      0.41      0.38       610
  Very Positive       0.42      0.36      0.38       359

       accuracy                           0.34      2365
      macro avg       0.34      0.34      0.34      2365
   weighted avg       0.33      0.34      0.33      2365



### Regresión

In [28]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [46]:
df_train = pd.read_pickle('train.pickle')
df_train = custom_features(df_train)
X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['estimated_sells'], test_size=0.3, random_state=0)

In [30]:
pipe_reg = Pipeline([
    ('Pre-procesamiento',preprocesisng),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Regresor',BaggingRegressor(random_state=0))
])

print("Resultados regresión sin texto\n")

start = time.time_ns()
pipe_reg.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_reg.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

Resultados regresión sin texto

Time elapsed for training: 9.014024873 seconds

Time elapsed for inference (eval set): 0.057883677 seconds

Error cuadrático medio = 730686091203.4044
Score R2 = 0.5924228283913218


In [47]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

MODEL = "distilbert-videogames-descriptions-sells"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """retorna el SequenceClassifierOutput"""
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

In [48]:
pipe_reg_bow = Pipeline([
    ('Pre-procesamiento',preprocesisng_bow),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Regresor',BaggingRegressor(random_state=0))
])

In [49]:
print("Resultados regresión con bag-of-words\n")

start = time.time_ns()
pipe_reg_bow.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_reg_bow.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

Resultados regresión con bag-of-words

Time elapsed for training: 11.977677203 seconds

Time elapsed for inference (eval set): 1.4346816690000002 seconds

Error cuadrático medio = 824150143306.8201
Score R2 = 0.5402885200173145


In [50]:
def logits_embedding(clf_output):
    # retorna el vector de scores de clasificacion (antes de la capa softmax)
    return clf_output['logits'][0].detach().numpy().reshape(1,1)

class LogitsEmbedding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embed = lambda row: logits_embedding(sentence_clf_output(row))
        X_new = X.apply(embed)
        X_new = np.concatenate(X_new.values)
        return X_new

preprocesisng_logits_reg = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        # ('StandardScaler',StandardScaler(), ['...']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        # ('unchanged',None,['english'])  # chequear como no hacer nada

        ('LogitsText',LogitsEmbedding(),'short_description')
])

In [51]:
pipe_reg_logits = Pipeline([
    ('Pre-procesamiento',preprocesisng_logits_reg),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ('Regresor',BaggingRegressor(random_state=0))
])

In [52]:
print("Resultados regresión con logit embeddings\n")

start = time.time_ns()
pipe_reg_logits.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
start = time.time_ns()
y_pred = pipe_reg_logits.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

Resultados regresión con logit embeddings

Time elapsed for training: 129.489771709 seconds

Time elapsed for inference (eval set): 52.863194716 seconds

Error cuadrático medio = 784042788295.7003
Score R2 = 0.5626604284372694


In [37]:
print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

Error cuadrático medio = 908859779499.0122
Score R2 = 0.49303743046892634


Un desafío en la tarea es como integrar las columnas de texto. El texto es usualmente una variable complicada de modelar. Por un lado en el curso vimos como usar modelos basados en bag-of-words. Esto consiste en contar las palabras (y eventualmente conjuntos cortos de palabras, llámese n-gramas) y generar columnas categóricas que representen la presencia o no de un palabra o n-grama en el documento en cuestión.

Hemos integrado este tipo de columnas en nuestro modelamiento. Sin embargo, una familia de modelos que es típicamente usada en tiempos recientes son los llamados modelos de lenguaje. Más precisamente, aquellos basados en la arquitectura de red neuronal "Transformer" han tenido un éxito rotundo resolviendo varias tareas de NLP. Usaremos también este modelo para resolver nuestra tarea.

En cualquier caso debemos tener en cuenta que la descripción del juego no tendría que ser a priori una variable importante en la predicción del éxito de este. Predecir si un juego será bueno o no es difícil para un ser humano, que puede decidir si un juego es atractivo o no para jugarlo, sin embargo sus ventas y sobretodo sus calificaciones dependerán de la experiencia de juego, que puede no estar reflejada en la descripción.

Dicho esto, hemos ajustado un modelo de lenguaje pre-entrenado para que predica las calificaciones de este. Esto se realizó usando la biblioteca de HuggingFace transformers. En particular usamos el modelo DistilBERT, que es una variante del exitoso modelo de Google BERT.

Un modelo de lenguaje como este nos entrega varias maneras de codificar texto. Por un lado tenemos vectores contextualizados por cada token en la secuencia. Esto nos permite tener el promedio de estos vectores como representación de la secuencia. También podemos usar el vector contextualizado del token especial [CLF], que es típicamente usado para las tareas de clasificación.

Lamentablemente estás opciones son costosas puesto que el espacio de representaciones de BERT es de alta dimensionalidad (785 en nuestro caso). Esto le agrega complejidad quizás excesiva a nuestros modelos. Es por esto que también consideramos como representación las probabilidades para cada categoría que predice nuesteo modelo. Llamamos a esto logits embeddings.

Experimentos simples y de inspección confirman que las opción logit es más conveniente que usar vectores de promedio y [CLF]. Además, la opción de usar bag-of-words igualmente es competitiva. Finalmente usamos también la opción de no usar texto. Son estas tres opciones (logits, bow y sin-texto) que integraremos a nuestro gridsearch. Cabe señalar que se intentó hacer lo mismo para la tarea de regresión pero los resultados no son satisfactorios.