# Borrador de pre-procesamiento

## Pre-procesamiento de columna de texto

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

In [2]:
MODEL = "distilbert-videogame-descriptions-rating"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [3]:
def sentence_clf_output(text):
    """retorna el SequenceClassifierOutput dado un tweet"""
    # text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

In [4]:
example = "This game is honestly rubbish, it doesn't even deserve a description."
output_obj = sentence_clf_output(example)


output_obj.keys()

odict_keys(['logits', 'hidden_states'])

In [5]:
output_obj['hidden_states'][-1].shape

torch.Size([1, 17, 768])

In [6]:
def first_tok_embedding(cfl_output):
    # retorna un numpy array correspondiente al token contextualizado
    return cfl_output['hidden_states'][-1][0][0].detach().numpy().reshape(1,768)

emb1 = first_tok_embedding(output_obj)
print(type(emb1))
emb1.shape

<class 'numpy.ndarray'>


(1, 768)

In [7]:
def sum_embedding(cfl_output):
    # retorna un numpy array correspondiente a la suma de los vectores contextualizados
    return cfl_output['hidden_states'][-1][0].detach().numpy().mean(axis=0).reshape(1,768)

emb2 = sum_embedding(output_obj)
print(type(emb2))
emb2.shape

<class 'numpy.ndarray'>


(1, 768)

In [9]:
def logits_embedding(clf_output):
    # retorna el vector de scores de clasificacion (antes de la capa softmax)
    return clf_output['logits'][0].detach().numpy().reshape(1,5)

emb3 = logits_embedding(output_obj)
print(type(emb3))
emb3.shape

<class 'numpy.ndarray'>


(1, 5)

---

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

class MinMax(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.xmin = X.min()
        self.xmax = X.max()
        return self

    def transform(self, X):
        new_X = X.apply(lambda row: (row - self.xmin)/(self.xmax - self.xmin))
        # new_X = (X - self.xmin)/(self.xmax - self.xmin)
        print(type(new_X))
        return new_X

minmax_transformer = MinMax()

In [36]:
import pandas as pd

df = pd.read_pickle('train.pickle')

In [37]:
# new_df = minmax_transformer.fit_transform(df[['average_playtime','achievements']])
new_df = minmax_transformer.fit_transform(df['average_playtime'])

<class 'pandas.core.series.Series'>


In [38]:
new_df

0       0.000000
1       0.000341
2       0.001138
3       0.006505
4       0.001285
          ...   
7876    0.000341
7877    0.000913
7878    0.000000
7879    0.000000
7880    0.000393
Name: average_playtime, Length: 7881, dtype: float64

In [41]:
columna = df['short_description'][:10]
columna

0    One day your roommate Leaves for no reason. Yo...
1    Manage a team of ghosthunters and free London ...
2    In Deponia, the world has degenerated into a v...
3    SEASON 6 NOW LIVE! The battle for Atlas contin...
4    CHUCHEL is a comedy adventure game from the cr...
5    LocoSoccer is a fun, crazy physics-based socce...
6    A sci-fi colony sim driven by an intelligent A...
7    A Wild Catgirl Appears is a short romance visu...
8    Infernium, a survival horror approach to Pac-M...
9    MINDNIGHT is a free-to-play online multiplayer...
Name: short_description, dtype: object

In [42]:
columna.apply(lambda row: logits_embedding(sentence_clf_output(row)))

0    [[-0.30798036, 0.10446549, 0.084020175, 0.3444...
1    [[0.08786825, 0.4088722, 0.15083486, 0.0637540...
2    [[-0.2446493, 0.0029724466, 0.049106434, 0.311...
3    [[-0.24621132, 0.017480843, 0.033693705, 0.330...
4    [[-0.42932254, 0.0114853345, 0.2061961, 0.3339...
5    [[0.067411326, 0.36655292, 0.031259853, 0.0803...
6    [[-0.677581, -0.051414825, 0.4004209, 0.501398...
7    [[-0.46146762, -0.11238558, 0.12086479, 0.3856...
8    [[-0.56208116, -0.052706547, 0.23826675, 0.530...
9    [[-0.08055741, 0.32602018, 0.14793858, 0.21990...
Name: short_description, dtype: object

In [43]:
class CategoriesTokenizer:
    def __init__(self):
        pass
        # self.ps = PorterStemmer()
    def __call__(self, doc):
        return doc.split(';')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

bog = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    min_df = 0.1  # hiperparametro a optimizar
    )

"""preprocessing = ColumnTransformer(
    transformers=[
        ('bag-of-categories',bog,['platforms','categories','genres','tags']),
        ('minmax',MinMaxScaler(),['...'])
    ]
)""";

In [46]:
small_df = df[:10]
# small_df

In [48]:
hola = bog.fit_transform(small_df[['tags','genres']])
hola

<2x2 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [49]:
chao = bog.fit_transform(small_df['tags'])
chao

<10x17 sparse matrix of type '<class 'numpy.int64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [50]:
print(chao)

  (0, 4)	1
  (0, 0)	1
  (0, 6)	1
  (1, 6)	1
  (1, 15)	1
  (1, 11)	1
  (2, 0)	1
  (2, 9)	1
  (2, 3)	1
  (3, 4)	1
  (3, 8)	1
  (3, 13)	1
  (4, 0)	1
  (4, 6)	1
  (4, 2)	1
  (5, 6)	1
  (5, 13)	1
  (5, 12)	1
  (6, 13)	1
  (6, 1)	1
  (6, 14)	1
  (7, 0)	1
  (7, 7)	1
  (7, 16)	1
  (8, 0)	1
  (8, 6)	1
  (8, 5)	1
  (9, 4)	1
  (9, 13)	1
  (9, 10)	1


### Integración con scikit-learn

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class LogitsEmbedding(BaseEstimator, TransformerMixin):
    """def fit(self, X, y=None, modelo=model, tokenizador=tokenizer):
        self.model = modelo
        self.tokenizer = tokenizador
        return self"""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # UPDATE
        embed = lambda row: logits_embedding(sentence_clf_output(row))
        X_new = X.apply(embed)
        return X_new
    
"""np.random.seed(42)

scaling_minmax = Pipeline([('scaler',MinMax())])

minmax_transformer = ColumnTransformer(
    transformers=[
        ('MinMaxScaler',MinMax(), ['Length','Recency','Frequency','Monetary','Periodicity'])
])

lrmfp_tsne_pipeline = Pipeline([
    # aplicar transformaciones para custom features
    ('LRMFP',FunctionTransformer(custom_features)),
    ('MinMaxScaler',minmax_transformer),
    ('TSNE',TSNE())
])"""

---

### Columnas

- [x] **name** -> Ignorar
- [-] **release_date** -> TO DO: separar mes y año
- [?] **english** -> poner tal cual?
- [x] **developer** -> BoW
- [x] **publisher** -> BoW
- [x] **platforms** -> BoW
- [x] **required_age** -> Minmax scaler
- [x] **categories** -> BoW
- [x] **genres** -> BoW
- [x] **tags** -> BoW
- [x] **achievements** -> power scaler
- [x] **average_playtime** -> power scaler
- [x] **price** -> minmax scaler
- [-] **short_description** -> embeddings entrenados, verificar mejor manera de integrarlos