In [1]:
import h2o
from h2o.automl import H2OAutoML

# Reference snippet, not executed (it refers to `train`/`test` frames that the
# cells shown here never define). Kept as real comments rather than as a bare
# string literal, which Python evaluates as a do-nothing expression statement.
#
#   # Identify predictors and response
#   x = train.columns
#   y = "response"
#   x.remove(y)
#
#   # For binary classification, response should be a factor
#   train[y] = train[y].asfactor()
#   test[y] = test[y].asfactor()

# Start the H2O cluster (locally).
# NOTE(review): the regression AutoML log further down reports a GLM failing
# with "Java heap space"; consider passing an explicit `max_mem_size` here —
# confirm against the RAM available on the machine.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16.1" 2022-08-12 LTS; OpenJDK Runtime Environment Corretto-11.0.16.9.1 (build 11.0.16.1+9-LTS); OpenJDK 64-Bit Server VM Corretto-11.0.16.9.1 (build 11.0.16.1+9-LTS, mixed mode)
  Starting server from /home/camilo/miniconda3/envs/prog_cientifica/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbsbk2kbl
  JVM stdout: /tmp/tmpbsbk2kbl/h2o_camilo_started_from_python.out
  JVM stderr: /tmp/tmpbsbk2kbl/h2o_camilo_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Argentina/Buenos_Aires
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.3
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_camilo_jrl6q3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.877 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from preprocessing import Nothing, CategoriesTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline
import re
import pandas as pd

class Nothing(BaseEstimator, TransformerMixin):
    """Pass-through transformer: fitting learns nothing, transforming returns the input.

    Note: this definition replaces the `Nothing` imported from `preprocessing`
    at the top of the cell.
    """

    def fit(self, X, y=None):
        """Learn nothing; return this transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return `X` exactly as received."""
        return X


class CategoriesTokenizer:
    """Callable tokenizer that splits a document on ';'.

    Instances take no constructor arguments and hold no state.
    Note: this definition replaces the `CategoriesTokenizer` imported from
    `preprocessing` at the top of the cell.
    """

    def __call__(self, doc):
        """Return the list of ';'-separated pieces of `doc`."""
        return doc.split(';')

# Bag-of-categories vectorizers for ';'-separated multi-value columns.
# `token_pattern=None` is passed because a custom tokenizer is supplied: the
# default pattern would be ignored anyway and scikit-learn warns about it.

# Keeps only tokens whose document frequency is at least 0.05.
boc_some_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    token_pattern=None,
    max_df=1.0,
    min_df=0.05,
)

# Keeps every token that appears in at least one document.
boc_many_values = CountVectorizer(
    tokenizer=CategoriesTokenizer(),
    token_pattern=None,
    max_df=1.0,
    min_df=1,
)


def custom_features(dataframe_in):
    """Return a copy of `dataframe_in` with date features and a publisher `revenue` column.

    Columns added or replaced on the copy:
      * `month`: calendar month of `release_date`.
      * `release_date`: replaced by its Julian date.
      * `revenue`: a fixed figure for publishers matching one of the patterns
        in the table below, 0.0 for everyone else (including missing
        publishers). NOTE(review): the figures look like company revenues —
        source and units are not shown in this notebook; confirm.

    The input frame is not modified.

    NOTE: a later cell of this notebook defines another `custom_features`
    (without `revenue`); whichever definition was executed last is the one in
    effect.
    """
    # (regex, regex flags, revenue). Applied top to bottom, so when a publisher
    # matches several patterns the last matching row wins. 'SEGA' is the only
    # pattern matched case-sensitively.
    publisher_revenue = (
        ('.*microsoft.*', re.IGNORECASE, 10.260),
        ('.*netease.*', re.IGNORECASE, 6.668),
        ('.*activision.*', re.IGNORECASE, 6.388),
        ('.*electronic.*', re.IGNORECASE, 5.537),
        ('.*bandai.*', re.IGNORECASE, 3.018),
        ('.*square.*', re.IGNORECASE, 2.386),
        ('.*nexon.*', re.IGNORECASE, 2.286),
        ('.*ubisoft.*', re.IGNORECASE, 1.446),
        ('.*konami.*', re.IGNORECASE, 1.303),
        ('.*SEGA.*', 0, 1.153),
        ('.*capcom.*', re.IGNORECASE, 0.7673),
        ('.*warner.*', re.IGNORECASE, 0.7324),
    )

    df = dataframe_in.copy(deep=True)

    # Parse the dates once and derive both features from the parsed values.
    release = pd.to_datetime(df['release_date'])
    df['month'] = release.dt.month
    df['release_date'] = release.apply(lambda ts: ts.to_julian_date())

    # A scalar broadcasts to every row whatever the index is. Building a Series
    # here would align on a fresh 0..n-1 index and leave NaN in frames whose
    # index differs (for example after a train/test split).
    df['revenue'] = 0.0

    for pattern, flags, revenue in publisher_revenue:
        # na=False: a missing publisher counts as "no match" instead of
        # producing NaN in the mask, which boolean indexing rejects.
        matches = df['publisher'].str.match(pattern, flags=flags, na=False).to_numpy()
        df.loc[matches, 'revenue'] = revenue

    return df


# Column-wise preprocessing for the feature frame that includes the BERT logits.
# NOTE: 'revenue' is only produced by the `custom_features` defined in this
# cell; the later redefinition of `custom_features` does not create it.
preprocessing_bert = ColumnTransformer(
    transformers=[
        # ';'-separated columns, vectorized with the min_df=0.05 vectorizer.
        ('BoC-plat', boc_some_values, 'platforms'),
        ('BoC-cat', boc_some_values, 'categories'),
        ('BoC-genres', boc_some_values, 'genres'),
        ('BoC-tags', boc_some_values, 'tags'),
        # Developer / publisher, vectorized with the min_df=1 vectorizer.
        ('BoC-dev', boc_many_values, 'developer'),
        ('BoC-pub', boc_many_values, 'publisher'),
        # Release month as a categorical feature.
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), ['month']),
        # Numeric columns rescaled by MinMaxScaler.
        ('MinMaxScaler', MinMaxScaler(), ['required_age', 'price', 'release_date']),
        # Named 'BoxCox', but the method actually used is Yeo-Johnson.
        (
            'BoxCox',
            PowerTransformer(method='yeo-johnson'),
            ['achievements', 'average_playtime', 'revenue'],
        ),
        # Passed through untouched.
        (
            'unchanged',
            Nothing(),
            ['english', 'bert1', 'bert2', 'bert3', 'bert4', 'bert5'],
        ),
    ]
)

In [3]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Identifier handed to `from_pretrained` for both the tokenizer and the model.
# NOTE(review): looks like a local fine-tuned checkpoint directory — confirm.
MODEL = "distilbert-videogame-descriptions-rating"

# Loaded once at module level; `sentence_clf_output` reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentence_clf_output(text):
    """Run the classifier on `text` and return its SequenceClassifierOutput.

    Uses the module-level `tokenizer` and `model`. Hidden states are requested
    along with the logits. The forward pass runs under `torch.no_grad()`
    because the result is only used for inference, so no autograd graph needs
    to be recorded.

    NOTE(review): no truncation is requested from the tokenizer; confirm that
    every description fits within the model's maximum sequence length.
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

def logits_embedding(clf_output):
    """Return the classification scores (logits, before the softmax layer).

    Takes the first entry of `clf_output['logits']`, detaches it, converts it
    to a NumPy array and reshapes it to a single row of 5 values.
    """
    first_logits = clf_output['logits'][0]
    scores = first_logits.detach().numpy()
    return scores.reshape(1, 5)

def integrar_bert_logits(df_in):
    """Return a copy of `df_in` with the classifier logits of each description.

    Every value of `short_description` is passed through `sentence_clf_output`
    and `logits_embedding`; the resulting five scores per row are stored in the
    new columns `bert1` .. `bert5`. The input frame is not modified.

    An empty frame is supported and simply gains five empty columns.
    """
    bert_columns = ['bert1', 'bert2', 'bert3', 'bert4', 'bert5']
    df = df_in.copy(deep=True)

    # One (1, 5) array per description.
    per_row = [
        logits_embedding(sentence_clf_output(text))
        for text in df['short_description']
    ]

    if per_row:
        bert_logits = np.concatenate(per_row)
    else:
        # np.concatenate refuses an empty sequence; keep the expected width.
        bert_logits = np.empty((0, len(bert_columns)))

    # Reuse the frame's own index so rows line up whatever the index is.
    df[bert_columns] = pd.DataFrame(bert_logits, index=df.index)

    return df

def custom_features(dataframe_in):
    """Return a copy of `dataframe_in` with date-derived features.

    Adds `month` (calendar month of `release_date`) and replaces
    `release_date` with its Julian date. The input frame is not modified.

    NOTE: this redefines the `custom_features` from the preprocessing cell,
    which additionally created a `revenue` column; once this cell has run,
    this version is the one in effect.
    """
    result = dataframe_in.copy(deep=True)
    parsed_dates = pd.to_datetime(result['release_date'])

    result['month'] = parsed_dates.dt.month
    result['release_date'] = parsed_dates.apply(lambda stamp: stamp.to_julian_date())
    return result

In [4]:
# Training frame: unpickle, append the BERT logit columns, then apply
# `custom_features`.
df_train = (
    pd.read_pickle('train.pickle')
    .pipe(integrar_bert_logits)
    .pipe(custom_features)
)

In [5]:
# Predictor names: every column of the training frame except the two targets.
columns = list(df_train.columns)
columns.remove('rating')
columns.remove('estimated_sells')

In [6]:
# Convert the pandas training frame into an H2OFrame for AutoML.
hf_train = h2o.H2OFrame(df_train)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
# Run AutoML for 20 base models
# Classification run: the target is `rating`, and `columns` excludes both
# `rating` and `estimated_sells`. The log below shows H2O dropping `name` and
# `short_description` for every model.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=columns, y='rating', training_frame=hf_train)

AutoML progress: |
00:50:07.285: _train param, Dropping bad and constant columns: [name, short_description]

█████████████████████████████████████████████████████
01:02:16.580: _train param, Dropping bad and constant columns: [name, short_description]

██
01:04:24.237: _train param, Dropping bad and constant columns: [name, short_description]


01:04:31.254: _train param, Dropping bad and constant columns: [name, short_description]

██████
01:14:47.414: _train param, Dropping bad and constant columns: [name, short_description]


01:15:16.359: _train param, Dropping bad and constant columns: [name, short_description]


01:15:22.674: _train param, Dropping bad and constant columns: [name, short_description]


01:15:30.921: _train param, Dropping bad and constant columns: [name, short_description]


01:15:45.279: _train param, Dropping bad and constant columns: [name, short_description]

█
01:18:03.479: _train param, Dropping bad and constant columns: [name, short_description]


01:19:05.

Mixed,Mostly Positive,Negative,Positive,Very Positive,Error,Rate
1585.0,2.0,3.0,66.0,0.0,0.0428744,"71 / 1,656"
6.0,1681.0,2.0,13.0,5.0,0.0152314,"26 / 1,707"
0.0,0.0,1290.0,0.0,0.0,0.0,"0 / 1,290"
0.0,3.0,4.0,1759.0,265.0,0.1339242,"272 / 2,031"
0.0,3.0,0.0,0.0,1194.0,0.0025063,"3 / 1,197"
1591.0,1689.0,1299.0,1838.0,1464.0,0.0472021,"372 / 7,881"

k,hit_ratio
1,0.9527979
2,0.9989849
3,0.9998732
4,1.0
5,1.0

Mixed,Mostly Positive,Negative,Positive,Very Positive,Error,Rate
532.0,204.0,317.0,537.0,66.0,0.678744,"1,124 / 1,656"
368.0,293.0,184.0,756.0,106.0,0.8283538,"1,414 / 1,707"
335.0,112.0,575.0,254.0,14.0,0.5542636,"715 / 1,290"
292.0,229.0,115.0,1150.0,245.0,0.4337765,"881 / 2,031"
66.0,74.0,35.0,580.0,442.0,0.6307435,"755 / 1,197"
1593.0,912.0,1226.0,3277.0,873.0,0.6203527,"4,889 / 7,881"

k,hit_ratio
1,0.3796472
2,0.642558
3,0.8318741
4,0.9525441
5,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.3770185,0.004348,0.3762981,0.3841843,0.3757842,0.3764928,0.3723331
auc,,0.0,,,,,
err,0.6229815,0.004348,0.6237019,0.6158157,0.6242158,0.6235072,0.6276669
err_count,981.8,41.045097,1021.0,989.0,995.0,992.0,912.0
logloss,1.3966794,0.0139617,1.3818406,1.4025223,1.408077,1.409489,1.3814679
max_per_class_error,0.8354025,0.0062813,0.8262108,0.8391813,0.8318584,0.8416666,0.8380952
mean_per_class_accuracy,0.3723334,0.0046229,0.3659002,0.3789127,0.3719483,0.3730164,0.3718895
mean_per_class_error,0.6276666,0.0046229,0.6340997,0.6210872,0.6280518,0.6269836,0.6281105
mse,0.5343803,0.0045261,0.5331347,0.5378264,0.5343159,0.5390667,0.5275577
null_deviance,5016.8887,223.34808,5201.0483,5113.2144,5074.2974,5066.9873,4628.896


In [8]:
# View the AutoML Leaderboard
# `rows=lb.nrows` asks `head` for every row of the leaderboard.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_AllModels_1_AutoML_1_20221214_05007,0.625176,1.39728,0.731049,0.534433
StackedEnsemble_BestOfFamily_1_AutoML_1_20221214_05007,0.632819,1.39886,0.731111,0.534523
XGBoost_grid_1_AutoML_1_20221214_05007_model_3,0.645279,1.4475,0.739698,0.547153
DeepLearning_grid_2_AutoML_1_20221214_05007_model_1,0.645524,1.49349,0.728871,0.531253
XGBoost_grid_1_AutoML_1_20221214_05007_model_2,0.655344,1.49005,0.740581,0.548461
XGBoost_3_AutoML_1_20221214_05007,0.65683,1.44231,0.740801,0.548786
DeepLearning_grid_1_AutoML_1_20221214_05007_model_1,0.659124,1.50631,0.735669,0.541209
XGBoost_1_AutoML_1_20221214_05007,0.661569,1.53459,0.7452,0.555324
DeepLearning_grid_3_AutoML_1_20221214_05007_model_1,0.66446,1.49385,0.733125,0.537472
XGBoost_grid_1_AutoML_1_20221214_05007_model_1,0.664501,1.47977,0.742586,0.551433


Regresión

In [12]:
# Predictor list used so far (still includes bert1..bert5).
print(columns)

['name', 'release_date', 'english', 'developer', 'publisher', 'platforms', 'required_age', 'categories', 'genres', 'tags', 'achievements', 'average_playtime', 'price', 'short_description', 'bert1', 'bert2', 'bert3', 'bert4', 'bert5', 'month']


In [16]:
# Drop the BERT logit columns from the predictor list before the regression
# run. Filtering, rather than calling `columns.remove` once per name, keeps
# this cell safe to re-run: a second execution is a no-op, not a ValueError.
columns = [
    col for col in columns
    if col not in ('bert1', 'bert2', 'bert3', 'bert4', 'bert5')
]

In [17]:
# Predictor list after the bert1..bert5 columns were removed.
print(columns)

['name', 'release_date', 'english', 'developer', 'publisher', 'platforms', 'required_age', 'categories', 'genres', 'tags', 'achievements', 'average_playtime', 'price', 'short_description', 'month']


In [18]:
# Regression run: same AutoML settings as the classifier, with
# `estimated_sells` as the target and `columns` as predictors.
# NOTE(review): the log below shows GLM_1 failing with
# "java.lang.OutOfMemoryError: Java heap space" and the progress output stops
# early — the cluster may need more memory; confirm before relying on `aml_reg`.
aml_reg = H2OAutoML(max_models=20, seed=1)
aml_reg.train(x=columns, y='estimated_sells', training_frame=hf_train)

AutoML progress: |
07:54:29.667: _train param, Dropping bad and constant columns: [name, short_description]

██████████████████
07:57:32.999: _train param, Dropping bad and constant columns: [name, short_description]

██
07:57:49.334: GLM_1_AutoML_3_20221214_75429 [GLM def_1] failed: DistributedException from /127.0.0.1:54321: 'Java heap space', caused by java.lang.OutOfMemoryError: Java heap space
07:57:49.392: _train param, Dropping bad and constant columns: [name, short_description]
07:57:59.959: _train param, Dropping bad and constant columns: [name, short_description]

█
07:59:13.424: _train param, Dropping bad and constant columns: [name, short_description]

█

In [None]:
# Regression leaderboard; `rows=lb_reg.nrows` asks `head` for every row.
lb_reg = aml_reg.leaderboard
lb_reg.head(rows=lb_reg.nrows)

submission

In [None]:
from zipfile import ZipFile
import os

def generateFiles(predict_data, clf_pipe, rgr_pipe):
    """Build `predictions.zip`, the archive to upload to CodaLab.

    Input
    predict_data: input data to predict on; passed unchanged to both models.
    clf_pipe: classifier exposing `predict(predict_data)`.
    rgr_pipe: regressor exposing `predict(predict_data)`.

    Output
    Writes `predictions.zip` in the current working directory with two
    members, `predictions_rgr.txt` and `predictions_clf.txt`, each holding one
    prediction per line. Returns None.

    The archive members are written straight from memory, so no intermediate
    .txt files are created and none can be left behind if something fails.
    """
    y_pred_clf = clf_pipe.predict(predict_data)
    y_pred_rgr = rgr_pipe.predict(predict_data)

    # Render both files completely before touching the disk: a prediction that
    # cannot be formatted then aborts the call without leaving partial files.
    clf_text = ''.join("%s\n" % item for item in y_pred_clf)
    rgr_text = ''.join("%s\n" % item for item in y_pred_rgr)

    with ZipFile('predictions.zip', 'w') as archive:
        archive.writestr('predictions_rgr.txt', rgr_text)
        archive.writestr('predictions_clf.txt', clf_text)

In [None]:
# Test frame: same preparation steps as the training frame.
df_test = (
    pd.read_pickle('test.pickle')
    .pipe(integrar_bert_logits)
    .pipe(custom_features)
)

In [None]:
# Predict on the test frame with both AutoML models and write predictions.zip.
# NOTE(review): `aml`/`aml_reg` are H2OAutoML objects while `df_test` is a
# pandas DataFrame; H2O's `predict` presumably expects (and returns) an
# H2OFrame — confirm this call works as written (no output is recorded for it).
generateFiles(df_test,aml,aml_reg)