# GridSearch para clasificación de ratings

**Clasificadores a considerar**:
- linearSVC
- KNeighbors
- RandomForest
- MLP (red neuronal)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import classification_report
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import re

In [2]:
df_train = pd.read_pickle('train.pickle')[:20]

## DF con BERT integrado

In [5]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

MODEL = "distilbert-videogame-descriptions-rating"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """retorna el SequenceClassifierOutput"""
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

def logits_embedding(clf_output):
    # retorna el vector de scores de clasificacion (antes de la capa softmax)
    return clf_output['logits'][0].detach().numpy().reshape(1,5)

In [6]:
def integrar_bert_logits(df_in):
    df = df_in.copy(deep=True)

    embed = lambda row: logits_embedding(sentence_clf_output(row))
    bert_logits = np.concatenate(df['short_description'].apply(embed).to_numpy())  # .reshape(100,3)

    df[['bert1','bert2','bert3','bert4','bert5']] = pd.DataFrame(bert_logits, index= df.index)

    return df

In [7]:
%%time
df_train = integrar_bert_logits(df_train)

CPU times: user 5.77 s, sys: 207 ms, total: 5.98 s
Wall time: 761 ms


## Pre-procesamiento

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class Nothing(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return X


class CategoriesTokenizer:
    def __init__(self):
        pass

    def __call__(self, doc):
        return doc.split(';')

boc_some_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 0.05  # hiperparametro a optimizar
    # valores para GridSearch : [5%, 10%, 15%] ???
    )


boc_many_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 1  # hiperparametro a optimizar
    # valores para GridSearch : [5, 10, 15] ???
    )

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize 

stop_words = stopwords.words('english')

# Definimos un tokenizador con Stemming
class StemmerTokenizer:
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        doc_tok = word_tokenize(doc)
        doc_tok = [t for t in doc_tok if t not in stop_words]
        return [self.ps.stem(t) for t in doc_tok]

bow = CountVectorizer(
    tokenizer= StemmerTokenizer(),
    ngram_range=(1,2),
    min_df = 0.05, max_df = 0.85
    )

def custom_features(dataframe_in):
    df = dataframe_in.copy(deep=True)

    df['month'] = pd.to_datetime(df['release_date']).dt.month
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.to_julian_date())

    df['revenue'] = pd.Series([0 for _ in range(len(dataframe_in))])

    df.loc[df.publisher.str.match('.*microsoft.*', flags=re.IGNORECASE).values, 'revenue'] = 10.260
    df.loc[df.publisher.str.match('.*netease.*', flags=re.IGNORECASE).values, 'revenue'] = 6.668
    df.loc[df.publisher.str.match('.*activision.*', flags=re.IGNORECASE).values, 'revenue'] = 6.388
    df.loc[df.publisher.str.match('.*electronic.*', flags=re.IGNORECASE).values, 'revenue'] = 5.537
    df.loc[df.publisher.str.match('.*bandai.*', flags=re.IGNORECASE).values, 'revenue'] = 3.018
    df.loc[df.publisher.str.match('.*square.*', flags=re.IGNORECASE).values, 'revenue'] = 2.386
    df.loc[df.publisher.str.match('.*nexon.*', flags=re.IGNORECASE).values, 'revenue'] = 2.286
    df.loc[df.publisher.str.match('.*ubisoft.*', flags=re.IGNORECASE).values, 'revenue'] = 1.446
    df.loc[df.publisher.str.match('.*konami.*', flags=re.IGNORECASE).values, 'revenue'] = 1.303
    df.loc[df.publisher.str.match('.*SEGA.*').values, 'revenue'] = 1.153
    df.loc[df.publisher.str.match('.*capcom.*', flags=re.IGNORECASE).values, 'revenue'] = 0.7673
    df.loc[df.publisher.str.match('.*warner.*', flags=re.IGNORECASE).values, 'revenue'] = 0.7324

    return df


preprocessing_bert = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english','bert1','bert2','bert3','bert4','bert5'])
])

preprocessing_bow = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english']),
        ('BoWText',bow,'short_description')
])

preprocessing_bert_bow = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english','bert1','bert2','bert3','bert4','bert5']),
        ('BoWText',bow,'short_description')
])

preprocessing = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english']),
])

In [9]:
df_train = custom_features(df_train)
X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['rating'], test_size=0.3, random_state=0, stratify=df_train['rating'])

## Ahora si el gridsearch

In [10]:
from sklearn.svm import LinearSVC


pipeline = Pipeline(
    [("procesamiento", preprocessing),
    ("selector", SelectPercentile(f_classif, percentile=90)),
    ("classifier", LinearSVC(random_state=0))]
)

## SVC Lineal

In [13]:
grilla_SV = {
    "classifier": [LinearSVC(random_state=0)],
    "classifier__C":[0.01, 1., 100., 1000.],
    "selector__percentile": [25, 50, 75, 95, 100],
    "selector__score_func":[f_classif],
    "procesamiento" : [preprocessing, preprocessing_bert, preprocessing_bert_bow, preprocessing_bow],
    "procesamiento__BoC-cat__min_df" : [0.05, 0.075],
    "procesamiento__BoC-genres__min_df" : [0.05, 0.075],
    "procesamiento__BoC-tags__min_df" : [0.05, 0.075],
    "procesamiento__BoC-dev__min_df" : [2, 5],
    "procesamiento__BoC-pub__min_df" : [2, 5],
}

gs_SV = HalvingGridSearchCV(
    pipeline,
    grilla_SV,
    scoring = 'f1_weighted',
    n_jobs=5,
    verbose = 0,
    random_state = 0,
    error_score = 0
)

In [14]:
gs_SV.fit(X_train, y_train)

ValueError: min_resources_=50 is greater than max_resources_=14.

In [None]:
from sklearn.metrics import get_scorer_names

get_scorer_names()

El mejor modelo encontrado es el siguiente

In [None]:
gs_SV.best_params_ 

y sus métricas son

In [None]:
gs_SV.best_score_

In [None]:
from sklearn.metrics import classification_report

print("Resultados clasificación SupportVector")
gs_SV.fit(X_train, y_train)
y_pred = gs_SV.predict(X_eval)
print(classification_report(y_eval,y_pred))

Resultados generales del gridsearch:

In [None]:
pd.DataFrame(gs_SV.cv_results_)

In [None]:
pd.DataFrame(gs_SV.cv_results_)['mean_test_score'].max()

## KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
grilla_KN = {
    "classifier": [KNeighborsClassifier()],
    "classifier__n_neighbors" :[3, 5, 10],
    "classifier__weights" : ['uniform', 'distance'],
    "selector__percentile": [25, 50, 75, 95, 100],
    "selector__score_func":[f_classif],
    "procesamiento" : [preprocessing, preprocessing_bert, preprocessing_bert_bow, preprocessing_bow],
    "procesamiento__BoC-cat__min_df" : [0.05, 0.075],
    "procesamiento__BoC-genres__min_df" : [0.05, 0.075],
    "procesamiento__BoC-tags__min_df" : [0.05, 0.075],
    "procesamiento__BoC-dev__min_df" : [2, 5],
    "procesamiento__BoC-pub__min_df" : [2, 5],
}

gs_KN = HalvingGridSearchCV(
    pipeline,
    grilla_KN,
    scoring = 'f1_weighted',
    n_jobs=-1,
    verbose = 0,
    random_state = 0,
    error_score = 0
)

In [None]:
gs_KN.fit(X_train, y_train)

El mejor modelo encontrado es el siguiente

In [None]:
gs_KN.best_params_ 

y sus métricas son

In [None]:
gs_KN.best_score_

In [None]:
from sklearn.metrics import classification_report

print("Resultados clasificación K-Neighbors")
gs_KN.fit(X_train, y_train)
y_pred = gs_KN.predict(X_eval)
print(classification_report(y_eval,y_pred))

Resultados generales del gridsearch:

In [None]:
pd.DataFrame(gs_KN.cv_results_)

In [None]:
pd.DataFrame(gs_KN.cv_results_)['mean_test_score'].max()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
grilla_RF = {
    "classifier": [RandomForestClassifier(random_state=0)],
    "classifier__n_estimators" :[25, 50, 100, 250, 500],
    "selector__percentile": [25, 50, 75, 95, 100],
    "selector__score_func":[f_classif],
    "procesamiento" : [preprocessing, preprocessing_bert, preprocessing_bert_bow, preprocessing_bow],
    "procesamiento__BoC-cat__min_df" : [0.05, 0.075],
    "procesamiento__BoC-genres__min_df" : [0.05, 0.075],
    "procesamiento__BoC-tags__min_df" : [0.05, 0.075],
    "procesamiento__BoC-dev__min_df" : [2, 5],
    "procesamiento__BoC-pub__min_df" : [2, 5],
}

gs_RF = HalvingGridSearchCV(
    pipeline,
    grilla_RF,
    scoring = 'f1_weighted',
    n_jobs=-1,
    verbose = 0,
    random_state = 0,
    error_score = 0
)

In [None]:
gs_RF.fit(X_train, y_train)

El mejor modelo encontrado es el siguiente

In [None]:
gs_RF.best_params_ 

y sus métricas son

In [None]:
gs_RF.best_score_

In [None]:
from sklearn.metrics import classification_report

print("Resultados clasificación Random Forest")
gs_RF.fit(X_train, y_train)
y_pred = gs_RF.predict(X_eval)
print(classification_report(y_eval,y_pred))

Resultados generales del gridsearch:

In [None]:
pd.DataFrame(gs_RF.cv_results_)

In [None]:
pd.DataFrame(gs_RF.cv_results_)['mean_test_score'].max()

# Multi layer perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
grilla_MLP = {
    "classifier": [MLPClassifier()],
    "classifier__hidden_layer_sizes":[(100,),(200,),(100,50,),(200,100,)],
    "classifier__solver" : ['sgd', 'adam'],
    "classifier__learning_rate" : ['constant', 'invscaling', 'adaptive'],
    "procesamiento" : [preprocessing, preprocessing_bert],
    "classifier__learning_rate_init" : [0.1,0.01,0.001,0.0001],
    "selector__percentile": [90, 95, 100],
    "selector__score_func":[f_classif],
    "procesamiento__BoC-cat__min_df" : [0.02],
    "procesamiento__BoC-genres__min_df" : [0.02],
    "procesamiento__BoC-tags__min_df" : [0.02],
    "procesamiento__BoC-dev__min_df" : [1],
    "procesamiento__BoC-pub__min_df" : [1],
}

gs_MLP = HalvingGridSearchCV(
    pipeline,
    grilla_MLP,
    scoring = 'f1_weighted',
    n_jobs=-1,
    verbose = 0,
    random_state = 0,
    error_score = 0
)

In [None]:
gs_MLP.fit(X_train, y_train)

El mejor modelo encontrado es el siguiente

In [None]:
gs_MLP.best_params_ 

y sus métricas son

In [None]:
gs_MLP.best_score_

In [None]:
from sklearn.metrics import classification_report

print("Resultados clasificación MLP")
gs_MLP.fit(X_train, y_train)
y_pred = gs_MLP.predict(X_eval)
print(classification_report(y_eval,y_pred))

Resultados generales del gridsearch:

In [None]:
pd.DataFrame(gs_MLP.cv_results_)

In [None]:
pd.DataFrame(gs_MLP.cv_results_)['mean_test_score'].max()