# Exploración de datos usando modelos preentrenados

In [1]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModel

**Carga/descarga del tokenizer**

In [2]:
# Identifier of the pretrained checkpoint to load.
MODEL = "plncmm/bert-clinical-scratch-wl-es"
# Local directory for the saved copy: the same identifier with the 'plncmm'
# prefix replaced by 'modelos' (a path relative to the working directory).
folder = MODEL.replace('plncmm','modelos')

try:
    # First try to load the tokenizer from the local directory.
    tokenizer = AutoTokenizer.from_pretrained(folder)
except Exception:
    # Any failure loading from `folder` falls back to loading by MODEL
    # identifier, then saves the tokenizer into `folder` for later runs.
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)

Downloading:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/735k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

**Carga/descarga del modelo**

In [3]:
try:
    # First try to load the model weights from the local directory `folder`.
    model = AutoModel.from_pretrained(folder)
except Exception:
    # Any failure loading from `folder` falls back to loading by MODEL
    # identifier, then saves the model into `folder` for later runs.
    model = AutoModel.from_pretrained(MODEL)
    model.save_pretrained(folder)

Downloading:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at plncmm/bert-clinical-scratch-wl-es and are newly initialized: ['bert.pooler.dense.weight',

In [4]:
import pandas as pd
import numpy as np

# Read the full dataset and use its 'index' column as the DataFrame index.
main_db = (
    pd.read_csv('../datos/DATA_HLF_MDS_2.csv', sep=',')
    .set_index('index')
)
# For per-column details, call main_db.info() instead of looking at the shape.
main_db.shape

(1526557, 20)

## Vectorización del texto de la columna `RESUMEN`

In [5]:
# Example dosage-instruction text used to try out the tokenizer and the model.
ejemplo = "1 COMPRIMIDO ORAL cada 12 horas durante 7 dias"

In [6]:
def sentence_clf_output(text):
    """Encode `text` with the pretrained model and return the full model output.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: Input string to tokenize and feed to the model.

    Returns:
        The output object produced by `model(...)` called with
        `return_dict=True` and `output_hidden_states=True`, so it exposes
        `hidden_states` in addition to the model's default fields.
    """
    # truncation=True asks the tokenizer to clip over-long inputs to its
    # configured maximum length instead of passing them on unchanged.
    # NOTE(review): this only has an effect if the tokenizer defines a maximum
    # length — confirm in the tokenizer config.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

In [7]:
# Run the example sentence through the model and list the fields it returns.
output_obj = sentence_clf_output(ejemplo)
print(output_obj.keys())
# Shape of the last entry of `hidden_states` for this example.
last_hidden = output_obj['hidden_states'][-1]
last_hidden.shape

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])


torch.Size([1, 11, 768])

In [8]:
# Tokenize the example as NumPy arrays and keep the id sequence of the
# first (and only) input sentence.
encoded_ej = tokenizer(ejemplo, return_tensors='np')
token_ids_ej = encoded_ej['input_ids'][0]
token_ids_ej

array([    4,  1098, 29258, 12791,  1748,  1992,  2596,  1672,   999,
       12873,     5])

In [9]:
# Decode the ids back to text to see what the tokenizer actually fed the model
# (including any special tokens it added).
tokenizer.decode(token_ids_ej)

'[CLS] 1 comprimido oral cada 12 horas durante 7 dias [SEP]'

In [10]:
def first_tok_embedding(cfl_output):
    """Return the last-layer embedding of the first token as a 2-D NumPy array.

    Args:
        cfl_output: Mapping-like model output with a 'hidden_states' entry
            holding a sequence of tensors indexed as [batch, token, feature].

    Returns:
        A NumPy array of shape (1, hidden_size) with the vector of the first
        token of the first sequence in the batch, taken from the last entry
        of 'hidden_states'.
    """
    first_token = cfl_output['hidden_states'][-1][0][0]
    # detach() drops any autograd history and cpu() makes the conversion work
    # for tensors living on another device. reshape(1, -1) keeps the row
    # layout without hard-coding the model's hidden size.
    return first_token.detach().cpu().numpy().reshape(1, -1)

In [11]:
emb1 = first_tok_embedding(output_obj)
print(type(emb1))
emb1.shape

<class 'numpy.ndarray'>


(1, 768)

A continuación seleccionamos N elementos y los vectorizamos.

In [12]:
# Number of rows to vectorize; the fixed random_state keeps the draw repeatable.
N = 1000
sampled_rows = main_db.sample(n=N, random_state=0)
samples = sampled_rows['RESUMEN']

In [13]:
%%time
for texto in samples:
    clf_obj = sentence_clf_output(texto)
    try:
        vectores = np.concatenate([first_tok_embedding(clf_obj),vectores], axis=0)
    except NameError:
        vectores = first_tok_embedding(clf_obj)

CPU times: user 3min 41s, sys: 1.68 s, total: 3min 43s
Wall time: 37.3 s


In [14]:
vectores.shape

(1000, 768)

In [15]:
from sklearn.manifold import TSNE

# Project the embeddings to 2-D. t-SNE is stochastic, so the seed is pinned
# (same value as the sampling step) to get the same layout on every run.
emb_tsne = TSNE(random_state=0).fit_transform(vectores)



In [16]:
import plotly.express as px

# Scatter the two t-SNE components, passing the sampled texts as hover labels.
tsne_x = emb_tsne[:, 0]
tsne_y = emb_tsne[:, 1]
fig = px.scatter(
    x=tsne_x,
    y=tsne_y,
    hover_name=samples,
    title='Visualización con t-SNE',
)

fig.show()