# Borrador integración o no de variable con descripción de texto

Idea:
- tener un pipeline básico para cada una de las tareas
- fijar pre-procesamiento
- compatibilidad con output de modelo de lenguaje
- elegir mejor manera de incluir modelo de lenguaje

**Columnas con categorías**

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

class CategoriesTokenizer:
    """Callable tokenizer for cells that hold several categories.

    The categories in a cell are expected to be joined by ``;``. Calling an
    instance on such a string returns the individual pieces as a list.
    """

    def __call__(self, doc):
        """Split *doc* on every ``;`` and return the resulting list."""
        categories = doc.split(';')
        return categories

Esta versión de vectorizador es para columnas con pocas categorías posibles (<1k):
- platforms (3 valores posibles)
- categories (29 valores posibles)
- genres (26 valores posibles)
- tags (306 valores posibles)

In [2]:
# Bag-of-categories vectorizer for columns with few distinct values.
boc_some_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    # The custom tokenizer replaces the default regex tokenization; passing
    # None avoids scikit-learn's "token_pattern will not be used" warning
    # at fit time.
    token_pattern=None,
    max_df=1.0,
    # Hyperparameter to tune (fraction of documents).
    # Candidate grid-search values: [0.05, 0.10, 0.15]?
    min_df=0.05,
)

Esta otra versión es para developers y publishers (5617 y 3961 valores posibles respectivamente)

In [3]:
# Bag-of-categories vectorizer for columns with thousands of distinct values.
boc_many_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    # The custom tokenizer replaces the default regex tokenization; passing
    # None avoids scikit-learn's "token_pattern will not be used" warning
    # at fit time.
    token_pattern=None,
    max_df=1.0,
    # Hyperparameter to tune (absolute document count).
    # Candidate grid-search values: [5, 10, 15]?
    min_df=1,
)

Variable de fecha de publicación

In [4]:
import pandas as pd

def custom_features(dataframe_in):
    """Return a copy of *dataframe_in* with date-derived features.

    The input frame is not modified. In the returned copy:

    - ``month`` is added, holding the month number of ``release_date``;
    - ``release_date`` is replaced by its Julian date (a float).

    Args:
        dataframe_in: DataFrame with a ``release_date`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A deep copy of the input with the two columns described above.
    """
    df = dataframe_in.copy(deep=True)

    # Parse the column once and reuse it for both derived features.
    release = pd.to_datetime(df['release_date'])
    df['month'] = release.dt.month
    df['release_date'] = release.apply(lambda ts: ts.to_julian_date())
    return df

**Juntando todo**

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder


# Baseline preprocessing: bag-of-categories for the multi-valued columns,
# one-hot encoding for the month, and scaling for the numeric columns.
preprocesisng = ColumnTransformer(
    transformers=[
        # Multi-valued columns with few distinct categories.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Multi-valued columns with many distinct values.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Categorical and numeric columns.
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), ['month']),
        (
            'MinMaxScaler',
            MinMaxScaler(),
            ['required_age', 'price', 'release_date'],
        ),
        (
            'BoxCox',  # name kept as-is; the transform itself is Yeo-Johnson
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime'],
        ),
        # TODO: decide whether a StandardScaler step is needed, and find out
        # how to pass the 'english' column through unchanged.
    ],
)

In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Baseline model: tabular preprocessing followed by an MLP classifier.
pipe = Pipeline(
    steps=[
        ('Pre-procesamiento', preprocesisng),
        (
            'Clasificador',
            MLPClassifier(early_stopping=True, max_iter=100, random_state=0),
        ),
    ],
)

In [7]:
from sklearn.model_selection import train_test_split

# Load the training set from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# from a trusted source.
df_train = pd.read_pickle('train.pickle')
# Add the date-derived features computed by custom_features.
df_train = custom_features(df_train)
# 70/30 train/eval split, stratified on the 'rating' target.
# NOTE(review): the whole frame is passed as X, so X_train / X_eval still
# contain the 'rating' column; presumably the pipelines never select it --
# verify before changing the ColumnTransformer's column lists or remainder.
X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['rating'], test_size=0.3, random_state=0, stratify=df_train['rating'])

In [8]:
import time
import math

def timeSince(since):
    """Return the number of seconds elapsed since *since*.

    *since* is a timestamp in nanoseconds, as returned by ``time.time_ns()``.
    """
    elapsed_ns = time.time_ns() - since
    return elapsed_ns * 10 ** (-9)

In [9]:
from sklearn.metrics import classification_report

# Fit the baseline pipeline, then predict on the eval split, timing each step.
print("Resultados clasificación MLP")
# Wall-clock start of training, in nanoseconds.
start = time.time_ns()
pipe.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
# Restart the clock so inference is timed separately from training.
start = time.time_ns()
y_pred = pipe.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))
# Per-class precision / recall / F1 on the eval split.
print(classification_report(y_eval,y_pred))

Resultados clasificación MLP
Time elapsed for training: 7.887833531 seconds

Time elapsed for inference (eval set): 0.041340058000000006 seconds

                 precision    recall  f1-score   support

          Mixed       0.28      0.29      0.28       497
Mostly Positive       0.24      0.16      0.19       512
       Negative       0.42      0.39      0.40       387
       Positive       0.31      0.42      0.36       610
  Very Positive       0.39      0.37      0.38       359

       accuracy                           0.32      2365
      macro avg       0.33      0.32      0.32      2365
   weighted avg       0.32      0.32      0.32      2365



## Agregando embeddings

In [10]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

# Identifier handed to from_pretrained for both the tokenizer and the model.
# NOTE(review): presumably a local directory or hub name of a fine-tuned
# DistilBERT checkpoint -- confirm where it is stored.
MODEL = "distilbert-videogame-descriptions-rating"

# Module-level tokenizer and classifier used by the embedding helpers.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """Run the sequence classifier on *text* and return its raw output.

    Args:
        text: The text to classify.

    Returns:
        The model output (``return_dict=True``), including the hidden states
        of every layer because ``output_hidden_states=True`` is requested.
    """
    import torch  # local import: only this helper needs torch directly

    # truncation=True keeps over-long texts within the tokenizer's maximum
    # length instead of letting them fail inside the model.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(
            **encoded_input,
            return_dict=True,
            output_hidden_states=True,
        )
    return output

### Versión logits

In [11]:
def logits_embedding(clf_output):
    """Return the classification scores of the first sequence as one row.

    The scores are the logits, i.e. the values before any softmax.

    Args:
        clf_output: Mapping-like classifier output whose ``'logits'`` entry
            is a tensor indexed as ``[sequence, class]``.

    Returns:
        A NumPy array of shape ``(1, n_classes)``.
    """
    # -1 lets the row width follow the model's number of classes instead of
    # assuming exactly five.
    return clf_output['logits'][0].detach().numpy().reshape(1, -1)

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class LogitsEmbedding(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each text into its classifier logits."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return one row of logits per entry of *X*.

        *X* must provide ``apply`` (e.g. a pandas Series of texts).
        """
        def to_logits(text):
            return logits_embedding(sentence_clf_output(text))

        per_text = X.apply(to_logits)
        # Join the per-text rows along the first axis.
        return np.concatenate(per_text.values)

In [13]:
# Preprocessing variant that appends the text classifier's logits for
# 'short_description' to the tabular features.
# NOTE(review): unlike `preprocesisng`, this variant neither one-hot encodes
# 'month' nor scales 'release_date'.
preprocesisng_logits = ColumnTransformer(
    transformers=[
        # Multi-valued columns with few distinct categories.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Multi-valued columns with many distinct values.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Numeric columns.
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price']),
        (
            'BoxCox',  # name kept as-is; the transform itself is Yeo-Johnson
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime'],
        ),
        # TODO: find out how to pass the 'english' column through unchanged.
        # Text column.
        ('LogitsText', LogitsEmbedding(), 'short_description'),
    ],
)

pipe_logits = Pipeline(
    steps=[
        ('Pre-procesamiento', preprocesisng_logits),
        (
            'Clasificador',
            MLPClassifier(early_stopping=True, max_iter=100, random_state=0),
        ),
    ],
)

In [14]:
# Fit the logits-embedding pipeline, then predict on the eval split,
# timing each step.
print("Resultados clasificación MLP con logit embeddings\n")

# Wall-clock start of training, in nanoseconds.
start = time.time_ns()
pipe_logits.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
# Restart the clock so inference is timed separately from training.
start = time.time_ns()
y_pred = pipe_logits.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

# Per-class precision / recall / F1 on the eval split.
print(classification_report(y_eval,y_pred))

Resultados clasificación MLP con logit embeddings

Time elapsed for training: 124.59128301000001 seconds

Time elapsed for inference (eval set): 51.775721214 seconds

                 precision    recall  f1-score   support

          Mixed       0.29      0.24      0.26       497
Mostly Positive       0.26      0.29      0.28       512
       Negative       0.40      0.41      0.41       387
       Positive       0.35      0.46      0.39       610
  Very Positive       0.47      0.24      0.32       359

       accuracy                           0.33      2365
      macro avg       0.35      0.33      0.33      2365
   weighted avg       0.34      0.33      0.33      2365



### Versión token [CLS]

In [15]:
def first_tok_embedding(cfl_output):
    """Return the last-layer vector of the first token as a single row.

    Args:
        cfl_output: Mapping-like model output whose ``'hidden_states'`` entry
            is a sequence of tensors indexed as ``[sequence, token, feature]``;
            the last tensor in that sequence is the one used.

    Returns:
        A NumPy array of shape ``(1, hidden_size)``.
    """
    first_token = cfl_output['hidden_states'][-1][0][0]
    # -1 lets the row width follow the model's hidden size instead of
    # assuming 768.
    return first_token.detach().numpy().reshape(1, -1)

class CLFTokenEmbedding(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to its first-token embedding."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return one first-token embedding row per entry of *X*.

        *X* must provide ``apply`` (e.g. a pandas Series of texts).
        """
        def to_first_token(text):
            return first_tok_embedding(sentence_clf_output(text))

        per_text = X.apply(to_first_token)
        # Join the per-text rows along the first axis.
        return np.concatenate(per_text.values)

In [16]:
# Preprocessing variant that appends the first-token embedding of
# 'short_description' to the tabular features.
# NOTE(review): unlike `preprocesisng`, this variant neither one-hot encodes
# 'month' nor scales 'release_date'.
preprocesisng_CLFToken = ColumnTransformer(
    transformers=[
        # Multi-valued columns with few distinct categories.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Multi-valued columns with many distinct values.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Numeric columns.
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price']),
        (
            'BoxCox',  # name kept as-is; the transform itself is Yeo-Johnson
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime'],
        ),
        # TODO: find out how to pass the 'english' column through unchanged.
        # Text column.
        # NOTE(review): the step is still called 'LogitsText' although it
        # holds the first-token embedding; looks like a copy-paste leftover.
        ('LogitsText', CLFTokenEmbedding(), 'short_description'),
    ],
)

pipe_CLFToken = Pipeline(
    steps=[
        ('Pre-procesamiento', preprocesisng_CLFToken),
        (
            'Clasificador',
            MLPClassifier(early_stopping=True, max_iter=100, random_state=0),
        ),
    ],
)

In [None]:
# Fit the first-token-embedding pipeline, then predict on the eval split,
# timing each step.
print("Resultados clasificación MLP con CLFToken embeddings\n")

# Wall-clock start of training, in nanoseconds.
start = time.time_ns()
pipe_CLFToken.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
# Restart the clock so inference is timed separately from training.
start = time.time_ns()
y_pred = pipe_CLFToken.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

3 min approx entre fit y predict

In [21]:
# Report for whatever y_pred currently holds.
# NOTE(review): presumably the first-token pipeline's predictions -- the
# cell numbers suggest out-of-order execution, so verify before trusting it.
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.29      0.26      0.27       497
Mostly Positive       0.25      0.20      0.22       512
       Negative       0.41      0.40      0.40       387
       Positive       0.33      0.43      0.37       610
  Very Positive       0.36      0.33      0.34       359

       accuracy                           0.32      2365
      macro avg       0.33      0.32      0.32      2365
   weighted avg       0.32      0.32      0.32      2365



### Versión promedio de embeddings

In [18]:
def mean_embedding(cfl_output):
    """Return the mean of the last-layer token vectors as a single row.

    Args:
        cfl_output: Mapping-like model output whose ``'hidden_states'`` entry
            is a sequence of tensors indexed as ``[sequence, token, feature]``;
            the last tensor in that sequence is the one used.

    Returns:
        A NumPy array of shape ``(1, hidden_size)`` holding the average (not
        the sum) over the tokens of the first sequence.
    """
    token_vectors = cfl_output['hidden_states'][-1][0].detach().numpy()
    # -1 lets the row width follow the model's hidden size instead of
    # assuming 768.
    return token_vectors.mean(axis=0).reshape(1, -1)

class MeanEmbedding(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to its mean token embedding."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return one mean-embedding row per entry of *X*.

        *X* must provide ``apply`` (e.g. a pandas Series of texts).
        """
        def to_mean(text):
            return mean_embedding(sentence_clf_output(text))

        per_text = X.apply(to_mean)
        # Join the per-text rows along the first axis.
        return np.concatenate(per_text.values)

In [19]:
# Preprocessing variant that appends the mean token embedding of
# 'short_description' to the tabular features.
# NOTE(review): unlike `preprocesisng`, this variant neither one-hot encodes
# 'month' nor scales 'release_date'.
preprocesisng_mean = ColumnTransformer(
    transformers=[
        # Multi-valued columns with few distinct categories.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Multi-valued columns with many distinct values.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Numeric columns.
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price']),
        (
            'BoxCox',  # name kept as-is; the transform itself is Yeo-Johnson
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime'],
        ),
        # TODO: find out how to pass the 'english' column through unchanged.
        # Text column.
        # NOTE(review): the step is still called 'LogitsText' although it
        # holds the mean embedding; looks like a copy-paste leftover.
        ('LogitsText', MeanEmbedding(), 'short_description'),
    ],
)

pipe_mean = Pipeline(
    steps=[
        ('Pre-procesamiento', preprocesisng_mean),
        (
            'Clasificador',
            MLPClassifier(early_stopping=True, max_iter=100, random_state=0),
        ),
    ],
)

In [22]:
# Fit the mean-embedding pipeline, then predict on the eval split,
# timing each step.
print("Resultados clasificación MLP con mean embeddings\n")

# Wall-clock start of training, in nanoseconds.
start = time.time_ns()
pipe_mean.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
# Restart the clock so inference is timed separately from training.
start = time.time_ns()
y_pred = pipe_mean.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

Resultados clasificación MLP con mean embeddings

Time elapsed for training: 138.760135343 seconds

Time elapsed for inference (eval set): 53.925199555000006 seconds



In [23]:
# Per-class precision / recall / F1 for the predictions currently in y_pred
# (presumably those of the mean-embedding pipeline -- verify execution order).
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.29      0.35      0.32       497
Mostly Positive       0.24      0.13      0.17       512
       Negative       0.43      0.34      0.38       387
       Positive       0.32      0.50      0.39       610
  Very Positive       0.40      0.25      0.31       359

       accuracy                           0.32      2365
      macro avg       0.34      0.31      0.31      2365
   weighted avg       0.33      0.32      0.31      2365



### Bag-of-words clásicos

In [24]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize 

stop_words = stopwords.words('english')


# Tokenizer with stemming.
class StemmerTokenizer:
    """Callable tokenizer that drops English stop words and stems the rest.

    The stop words are copied from the module-level ``stop_words`` when the
    instance is created, so later changes to that list do not affect an
    existing tokenizer.
    """

    def __init__(self):
        self.ps = PorterStemmer()
        # A set gives constant-time membership tests; the original list was
        # scanned once for every token.
        self._stop_words = frozenset(stop_words)

    def __call__(self, doc):
        """Return the Porter stems of the non-stop-word tokens of *doc*."""
        return [
            self.ps.stem(token)
            for token in word_tokenize(doc)
            if token not in self._stop_words
        ]

# Classic bag-of-words over unigrams and bigrams of the stemmed tokens.
bow = CountVectorizer(
    tokenizer=StemmerTokenizer(),
    # The custom tokenizer replaces the default regex tokenization; passing
    # None avoids scikit-learn's "token_pattern will not be used" warning
    # at fit time.
    token_pattern=None,
    ngram_range=(1, 2),
    # Keep terms present in at least 5% and at most 85% of the documents.
    min_df=0.05,
    max_df=0.85,
)

In [25]:
# Preprocessing variant that appends a bag-of-words of 'short_description'
# to the tabular features.
# NOTE(review): unlike `preprocesisng`, this variant neither one-hot encodes
# 'month' nor scales 'release_date'.
preprocesisng_bow = ColumnTransformer(
    transformers=[
        # Multi-valued columns with few distinct categories.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Multi-valued columns with many distinct values.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Numeric columns.
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price']),
        (
            'BoxCox',  # name kept as-is; the transform itself is Yeo-Johnson
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime'],
        ),
        # TODO: find out how to pass the 'english' column through unchanged.
        # Text column.
        ('BoWText', bow, 'short_description'),
    ],
)

pipe_bow = Pipeline(
    steps=[
        ('Pre-procesamiento', preprocesisng_bow),
        (
            'Clasificador',
            MLPClassifier(early_stopping=True, max_iter=100, random_state=0),
        ),
    ],
)

In [26]:
# Fit the bag-of-words pipeline, then predict on the eval split,
# timing each step.
print("Resultados clasificación MLP con bag-of-words\n")

# Wall-clock start of training, in nanoseconds.
start = time.time_ns()
pipe_bow.fit(X_train, y_train)
print("Time elapsed for training: {} seconds\n".format(timeSince(start)))
# Restart the clock so inference is timed separately from training.
start = time.time_ns()
y_pred = pipe_bow.predict(X_eval)
print("Time elapsed for inference (eval set): {} seconds\n".format(timeSince(start)))

Resultados clasificación MLP con bag-of-words

Time elapsed for training: 8.321603699 seconds

Time elapsed for inference (eval set): 1.5924704680000001 seconds



In [27]:
# Per-class precision / recall / F1 for the predictions currently in y_pred
# (presumably those of the bag-of-words pipeline -- verify execution order).
print(classification_report(y_eval,y_pred))

                 precision    recall  f1-score   support

          Mixed       0.29      0.36      0.32       497
Mostly Positive       0.23      0.18      0.20       512
       Negative       0.39      0.37      0.38       387
       Positive       0.35      0.36      0.36       610
  Very Positive       0.36      0.37      0.37       359

       accuracy                           0.32      2365
      macro avg       0.32      0.33      0.32      2365
   weighted avg       0.32      0.32      0.32      2365

