In [287]:
! pip install transformers
! pip install sacremoses









In [288]:
import pandas as pd

file_name = 'data_in_progres.xlsx'
df = pd.read_excel(io=file_name)

model_names = {
    "herbert-klej-cased-v1": {
        "tokenizer": "allegro/herbert-klej-cased-tokenizer-v1",
        "model": "allegro/herbert-klej-cased-v1",
    },
    "herbert-base-cased": {
        "tokenizer": "allegro/herbert-base-cased",
        "model": "allegro/herbert-base-cased",
    },
    "herbert-large-cased": {
        "tokenizer": "allegro/herbert-large-cased",
        "model": "allegro/herbert-large-cased",
    },
}

In [289]:
from typing import Callable, List, Optional, Tuple
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.bert_model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        tokenized_text = self.bert_tokenizer.encode_plus(text,
                                                         add_special_tokens=True,
                                                         max_length=self.max_length
                                                         )["input_ids"]

        attention_mask = [1] * len(tokenized_text)

        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.bert_model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        return self

In [290]:
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [291]:
tokenizer = AutoTokenizer.from_pretrained(model_names["herbert-base-cased"]["tokenizer"], use_fast=False)

In [292]:
bert_model = AutoModel.from_pretrained(model_names["herbert-base-cased"]["model"]) 

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.decoder.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [293]:
bert_transformer = BertTransformer(tokenizer, bert_model)

In [294]:
X_1_COLUMN = 'nlp_5'
X_2_COLUMN = 'age'
Y_COLUMN = 'GDT_score'

In [295]:
from sklearn.model_selection import train_test_split

cleaned_df = df[~df[X_1_COLUMN].isna()]
working_df = cleaned_df.filter([X_2_COLUMN, X_1_COLUMN, Y_COLUMN])
df_train, df_test = train_test_split(working_df, test_size = 0.2, random_state=23)

In [296]:
X_train = df_train.drop(Y_COLUMN, axis=1)
Y_train = df_train[Y_COLUMN].values

X_test = df_test.drop(Y_COLUMN, axis=1)
Y_test = df_test[Y_COLUMN].values

In [297]:
def return_text_data(df_):
    return df[X_1_COLUMN].values

def return_numeric_data(df_):
    return df[X_2_COLUMN].values

In [298]:
from sklearn.preprocessing import FunctionTransformer
from sklearn import svm

transformer_text = FunctionTransformer(return_text_data)
transfomer_numeric = FunctionTransformer(return_numeric_data)
classifier = svm.SVR(kernel='linear')

In [299]:
from sklearn.pipeline import Pipeline, FeatureUnion

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
            ('selector', transformer_text)
            ])),
             ('text_features', Pipeline([
                ('selector', transfomer_numeric),
                ("vectorizer", bert_transformer)
            ]))
         ])),
    ('classifier', classifier)
])

In [327]:
pipeline.steps[0]

('features',
 FeatureUnion(transformer_list=[('numeric_features',
                                 Pipeline(steps=[('selector',
                                                  FunctionTransformer(func=<function return_text_data at 0x000001764763D2D0>))])),
                                ('text_features',
                                 Pipeline(steps=[('selector',
                                                  FunctionTransformer(func=<function return_numeric_data at 0x000001764A8FB2E0>)),
                                                 ('vectorizer',
                                                  BertTransformer(bert_model=BertModel(
   (embeddings): BertEmbe...
                                                                  bert_tokenizer=PreTrainedTokenizerFast(name_or_path='allegro/herbert-base-cased', vocab_size=50000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'unk_token': '<unk>', 'sep_token': 

In [323]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = dict(numeric__features=[0],
                  features___text=[0],
                  svm__C=[1])


In [324]:
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=10)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START features___text=0, numeric__features=0, svm__C=1............


ValueError: Invalid parameter numeric for estimator Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function return_text_data at 0x000001764763D2D0>))])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function return_numeric_data at 0x000001764A8FB2E0>)),
                                                                 ('vectorizer',
                                                                  BertTransformer(bert_model=Ber...
                                                                                  bert_tokenizer=PreTrainedTokenizerFast(name_or_path='allegro/herbert-base-cased', vocab_size=50000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}),
                                                                                  embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x0000017647282320>))]))])),
                ('classifier', SVR(kernel='linear'))]). Check the list of available parameters with `estimator.get_params().keys()`.

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START features___text=1, numeric__features=2, svm__C=1............


ValueError: Invalid parameter numeric for estimator Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function return_text_data at 0x000001764763D2D0>))])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector',
                                                                  FunctionTransformer(func=<function return_numeric_data at 0x000001764A8FB2E0>)),
                                                                 ('vectorizer',
                                                                  BertTransformer(bert_model=Ber...
                                                                                  bert_tokenizer=PreTrainedTokenizerFast(name_or_path='allegro/herbert-base-cased', vocab_size=50000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}),
                                                                                  embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x0000017647282320>))]))])),
                ('classifier', SVR(kernel='linear'))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
result = model.predict(X_test)

In [None]:
resukt

In [None]:
from scipy import stats
print(stats.pearsonr(result, Y_test))