# Experimentos para clasificación

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

# Name or path of the pretrained checkpoint handed to `from_pretrained` below
# (presumably a local directory holding the fine-tuned model -- TODO confirm).
MODEL = "distilbert-videogame-descriptions-rating"

# Load the tokenizer and the sequence-classification model for MODEL; both are
# used as module-level globals by `sentence_clf_output`.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """Run ``text`` through the module-level tokenizer and classifier.

    Parameters
    ----------
    text : str
        Text to classify (a single description per call in this notebook).

    Returns
    -------
    The object returned by ``model(...)`` when called with
    ``return_dict=True`` and ``output_hidden_states=True``; its ``'logits'``
    entry is what ``logits_embedding`` reads.
    """
    import torch  # local import: torch is not imported by the notebook's import cells

    # truncation=True clips inputs longer than the model's maximum sequence
    # length instead of letting the forward pass fail on them.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)

    # Inference only: no gradients are needed, so do not build the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

def logits_embedding(clf_output):
    """Return the classification scores (pre-softmax logits) as a row vector.

    Parameters
    ----------
    clf_output : mapping
        Classifier output whose ``'logits'`` entry holds, at position 0, a
        tensor-like object supporting ``.detach().numpy()``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_logits)``.
    """
    # reshape(1, -1) keeps the (1, 5) result for a five-class model while no
    # longer hard-coding the number of classes.
    return clf_output['logits'][0].detach().numpy().reshape(1, -1)

In [3]:
def integrar_bert_logits(df_in):
    """Return a deep copy of ``df_in`` extended with the classifier logits.

    Every value of the ``short_description`` column is passed through
    ``sentence_clf_output`` and ``logits_embedding``; the resulting row
    vectors are concatenated and stored in the new columns
    ``bert1`` ... ``bert5``. The input frame itself is not modified.
    """
    result = df_in.copy(deep=True)

    def _embed(text):
        # One (1, n_logits) row vector per description.
        return logits_embedding(sentence_clf_output(text))

    per_row_logits = result['short_description'].apply(_embed).to_numpy()
    stacked_logits = np.concatenate(per_row_logits)

    logit_columns = ['bert1', 'bert2', 'bert3', 'bert4', 'bert5']
    # Re-use the frame's own index so the assignment aligns row by row.
    result[logit_columns] = pd.DataFrame(stacked_logits, index=result.index)

    return result

In [4]:
# Load the pickled training set and append the BERT logit columns to it.
# NOTE(review): unpickling can execute arbitrary code; only load
# 'train.pickle' from a trusted source.
df_train = integrar_bert_logits(pd.read_pickle('train.pickle'))

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from preprocessing import Nothing, CategoriesTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline
import re

class Nothing(BaseEstimator, TransformerMixin):
    """Identity transformer: fitting learns nothing, transforming returns the input.

    NOTE(review): this definition shadows the ``Nothing`` imported from
    ``preprocessing`` earlier in the same cell.
    """

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X


class CategoriesTokenizer:
    """Callable tokenizer that splits a string on ``';'``.

    NOTE(review): this definition shadows the ``CategoriesTokenizer`` imported
    from ``preprocessing`` earlier in the same cell.
    """

    def __call__(self, doc):
        """Return the list of ``';'``-separated pieces of ``doc``."""
        tokens = doc.split(';')
        return tokens

# Bag-of-categories vectorizer used below for platforms, categories, genres
# and tags: a token is kept only if it occurs in at least 5% of the rows
# (a float min_df is a proportion of documents).
boc_some_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    max_df=1.0,
    min_df=0.05,
)


# Bag-of-categories vectorizer used below for developer and publisher: every
# token seen in at least one row is kept (an int min_df is an absolute count).
boc_many_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    max_df=1.0,
    min_df=1,
)


# (keyword, case_sensitive, revenue) rules applied in this order by
# `custom_features`; when several keywords occur in one publisher string the
# LAST matching rule wins. 'SEGA' is deliberately kept case-sensitive, as it
# was the only rule without an ignore-case flag.
# NOTE(review): the units of the revenue figures are not stated in this notebook.
_PUBLISHER_REVENUE = (
    ('microsoft', False, 10.260),
    ('netease', False, 6.668),
    ('activision', False, 6.388),
    ('electronic', False, 5.537),
    ('bandai', False, 3.018),
    ('square', False, 2.386),
    ('nexon', False, 2.286),
    ('ubisoft', False, 1.446),
    ('konami', False, 1.303),
    ('SEGA', True, 1.153),
    ('capcom', False, 0.7673),
    ('warner', False, 0.7324),
)


def custom_features(dataframe_in):
    """Return a deep copy of ``dataframe_in`` with hand-crafted feature columns.

    Reads the ``release_date`` and ``publisher`` columns and produces:

    * ``month``        -- month number of the release date;
    * ``release_date`` -- overwritten with the Julian date of the release;
    * ``revenue``      -- hard-coded figure for publishers whose name contains
      one of the keywords in ``_PUBLISHER_REVENUE``, ``0.0`` otherwise.

    The input frame is not modified. Because ``release_date`` is overwritten,
    the function is not meant to be applied to its own output.
    """
    df = dataframe_in.copy(deep=True)

    # Parse the dates once and derive both date features from the same Series.
    release = pd.to_datetime(df['release_date'])
    df['month'] = release.dt.month
    df['release_date'] = release.apply(lambda x: x.to_julian_date())

    # Scalar assignment fills every row regardless of the frame's index; a
    # freshly built Series would be aligned on a 0..n-1 index and leave NaN
    # in frames that are indexed differently.
    df['revenue'] = 0.0

    for keyword, case_sensitive, revenue in _PUBLISHER_REVENUE:
        # Plain substring test; na=False treats a missing publisher as "no match".
        mask = df['publisher'].str.contains(
            keyword, case=case_sensitive, regex=False, na=False
        )
        df.loc[mask.to_numpy(dtype=bool), 'revenue'] = revenue

    return df


# Column-wise preprocessing that feeds the feature selector and classifier.
preprocessing_bert = ColumnTransformer(
    transformers=[
        # ';'-separated columns, keeping only tokens frequent enough for boc_some_values.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),

        # ';'-separated columns where every observed token is kept.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),

        # Numeric / calendar features.
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), ['month']),
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price', 'release_date']),
        # NOTE(review): the step is named 'BoxCox' but the method used is Yeo-Johnson.
        ('BoxCox', PowerTransformer(method='yeo-johnson'), ['achievements', 'average_playtime', 'revenue']),

        # Columns forwarded as they are, including the BERT logits.
        ('unchanged', Nothing(), ['english', 'bert1', 'bert2', 'bert3', 'bert4', 'bert5']),
    ]
)

In [6]:
# Add the derived columns (month, Julian release_date, revenue) to the training frame.
# NOTE(review): df_train is rebound to the transformed frame, so re-running this
# cell alone feeds custom_features its own output (release_date is then already
# numeric); re-run the loading cell first.
df_train = custom_features(df_train)
# X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['rating'], test_size=0.3, random_state=0, stratify=df_train['rating'])

In [7]:
def make_pipeline(clf):
    """Build the preprocessing -> feature selection -> classifier pipeline.

    ``clf`` becomes the final ``"classifier"`` step; it is preceded by the
    shared ``preprocessing_bert`` column transformer and a ``SelectPercentile``
    selector keeping the top 95% of features by ``f_classif`` score.
    """
    steps = [
        ("procesamiento", preprocessing_bert),
        ("selector", SelectPercentile(f_classif, percentile=95)),
        ("classifier", clf),
    ]
    return Pipeline(steps)

# Borrador de GridSearch (clasificación)

**Candidatos**:
- LinearSVC
- KNeighborsClassifier
- RandomForestClassifier
- MLPClassifier
- VotingClassifier (combinación de los cuatro anteriores)

In [8]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

In [9]:
import numpy as np
# Tell NumPy to ignore division-by-zero and invalid-operation floating-point
# errors for the rest of the session (presumably to silence warnings raised
# while scoring features during cross-validation -- TODO confirm).
np.seterr(divide='ignore', invalid='ignore');

In [10]:
def _base_classifiers():
    """Return fresh ``(name, estimator)`` pairs for the four base candidates."""
    return [
        ('SVC', LinearSVC(random_state=0)),
        ('KN', KNeighborsClassifier(weights='distance')),
        ('RF', RandomForestClassifier(n_estimators=250, random_state=0)),
        ('MLP', MLPClassifier(hidden_layer_sizes=(200,), learning_rate_init=0.01, solver='sgd', random_state=0)),
    ]


# The four base candidates on their own, followed by a voting ensemble built
# from a second, independent set of identically configured estimators.
clasificadores_exp = [estimator for _, estimator in _base_classifiers()]
clasificadores_exp.append(VotingClassifier(estimators=_base_classifiers()))

In [11]:
%%capture
# Per-fold cross-validation scores, keyed by the classifier's class name.
results_cv = {}

# Cross-validate every candidate inside the full preprocessing pipeline using
# the weighted F1 score. The loop variable is `classif`; the body must use it
# (referring to any other name only works with leftover kernel state).
for classif in clasificadores_exp:
    pipe = make_pipeline(classif)
    # The whole frame is passed as X; the pipeline's column transformer picks
    # the feature columns it was configured with.
    score = cross_val_score(pipe, df_train, df_train['rating'], scoring='f1_weighted')
    results_cv[type(classif).__name__] = score

In [12]:
# Report the per-fold scores collected for each classifier.
for classif, fold_scores in results_cv.items():
    print("CV SCORE {}: {}".format(classif, fold_scores))

CV SCORE LinearSVC: [0.36057687 0.34802132 0.37176183 0.35326648 0.34257792]
CV SCORE KNeighborsClassifier: [0.29278178 0.28670371 0.31304605 0.31712508 0.30444146]
CV SCORE RandomForestClassifier: [0.35857368 0.36460172 0.37187043 0.35138941 0.35626242]
CV SCORE MLPClassifier: [0.36815039 0.35760368 0.37994783 0.37399761 0.35189534]
CV SCORE VotingClassifier: [0.36662592 0.36160182 0.37349064 0.37553451 0.35211995]


| Modelo | Fold 1 | Fold 2 | Fold 3 | Fold 4 | Fold 5 | **Promedio** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| **LinearSVC** | 0.36057687 | 0.34802132 | 0.37176183 | 0.35326648 | 0.34257792 | **0.3552** |
| **KNeighborsClassifier** | 0.29278178 | 0.28670371 | 0.31304605 | 0.31712508 | 0.30444146 | **0.3028** |
| **RandomForestClassifier** | 0.35857368 | 0.36460172 | 0.37187043 | 0.35138941 | 0.35626242 | **0.3605** |
| **MLPClassifier** | 0.36815039 | 0.35760368 | 0.37994783 | 0.37399761 | 0.35189534 | **0.3663** |
| **VotingClassifier** | 0.36662592 | 0.36160182 | 0.37349064 | 0.37553451 | 0.35211995 | **0.3659** |