# Exploración de datos usando modelos preentrenados

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel

**Carga/descarga del tokenizer**

In [2]:
# Identifier of the pre-trained model, passed to from_pretrained when no local copy loads.
MODEL = "plncmm/bert-clinical-scratch-wl-es"
# Local directory for the saved copy: the same identifier with 'plncmm' replaced by 'modelos'.
folder = MODEL.replace('plncmm','modelos')

# Try the local copy first; on any failure, load by identifier instead and
# save the result into `folder` so the next run can load it locally.
try:
    tokenizer = AutoTokenizer.from_pretrained(folder)
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)

**Carga/descarga del modelo**

In [3]:
# Same local-first strategy as for the tokenizer: load from `folder`, and on any
# failure load by the `MODEL` identifier and save a copy into `folder`.
try:
    model = AutoModel.from_pretrained(folder)
except Exception:
    model = AutoModel.from_pretrained(MODEL)
    model.save_pretrained(folder)

In [4]:
import pandas as pd
import numpy as np

# Number of rows to draw for the exploration.
N = 1000
# Alternative kept for reference: sample only the de-duplicated RESUMEN column.
# samples = pd.read_csv('../datos/DATA_HLF_MDS_2.csv', sep=',')['RESUMEN'].drop_duplicates().sample(n=N, random_state=0)
# Draw N rows from the full table; the fixed random_state makes the draw repeatable.
samples = pd.read_csv('../datos/DATA_HLF_MDS_2.csv', sep=',').sample(n=N, random_state=0)

## Vectorización de texto RESUMEN

In [5]:
# Example text used below to inspect what the tokenizer and the model return.
ejemplo = "1 COMPRIMIDO ORAL cada 12 horas durante 7 dias"

In [6]:
def sentence_clf_output(text):
    """Run the pre-trained model on a single text and return its full output.

    Parameters
    ----------
    text : str
        Text to encode. It is tokenized with the notebook-level ``tokenizer``.

    Returns
    -------
    The model output requested with ``return_dict=True`` and
    ``output_hidden_states=True``, so the per-layer hidden states are
    available under the ``'hidden_states'`` key.
    """
    # Local import: the notebook does not import torch at the top level.
    import torch

    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is recorded
    # for every forward pass. The returned values are unchanged.
    with torch.no_grad():
        output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    return output

In [7]:
# Run the model on the example text and list the fields of the returned output.
output_obj = sentence_clf_output(ejemplo)
print(output_obj.keys())
# Shape of the last entry of `hidden_states`.
output_obj['hidden_states'][-1].shape

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])


torch.Size([1, 11, 768])

In [8]:
# Token ids of the example text as a NumPy array; [0] selects the row for this single text.
token_ids_ej = tokenizer(ejemplo, return_tensors='np')['input_ids'][0]
token_ids_ej

array([    4,  1098, 29258, 12791,  1748,  1992,  2596,  1672,   999,
       12873,     5])

In [9]:
# Decode the ids back to text to see exactly what the tokenizer produced.
tokenizer.decode(token_ids_ej)

'[CLS] 1 comprimido oral cada 12 horas durante 7 dias [SEP]'

In [10]:
def first_tok_embedding(cfl_output):
    """Return the last-layer vector of the first token as a (1, hidden_size) array.

    Parameters
    ----------
    cfl_output : mapping
        Model output exposing ``'hidden_states'``; its last entry is indexed
        as ``[0][0]``, i.e. first sequence of the batch, first token.

    Returns
    -------
    numpy.ndarray
        Array with a single row holding the first token's contextual vector.
    """
    first_token = cfl_output['hidden_states'][-1][0][0]
    # reshape(1, -1) instead of a hard-coded 768 so other hidden sizes also work;
    # .cpu() makes the conversion valid when the tensor is not on the CPU.
    return first_token.detach().cpu().numpy().reshape(1, -1)

In [11]:
# Sanity check on the example: type and shape of the first-token embedding.
emb1 = first_tok_embedding(output_obj)
print(type(emb1))
emb1.shape

<class 'numpy.ndarray'>


(1, 768)

In [12]:
%%time
for texto in samples['RESUMEN']:
    clf_obj = sentence_clf_output(texto)
    try:
        vectores = np.concatenate([first_tok_embedding(clf_obj),vectores], axis=0)
    except NameError:
        vectores = first_tok_embedding(clf_obj)

CPU times: user 3min 10s, sys: 1.19 s, total: 3min 11s
Wall time: 31.9 s


In [13]:
# Check the size of the stacked first-token embeddings.
vectores.shape

(1000, 768)

In [14]:
from sklearn.manifold import TSNE

# Project the first-token embeddings with t-SNE (PCA initialisation, fixed seed).
emb_tsne = TSNE(init='pca',random_state=0).fit_transform(vectores)



In [15]:
import plotly.express as px

# Scatter plot of the first two t-SNE coordinates. Hover text and colour come
# from `samples`, so `emb_tsne` is expected to have its rows in the same order
# as `samples`.
fig = px.scatter(
    samples, x=emb_tsne[:,0], y=emb_tsne[:,1], title='Visualización con t-SNE',
    hover_name='RESUMEN', hover_data=['DIAGDESC','ESPECIALIDAD','CODIGO_MEDICAMENTO'], color='ESPECIALIDAD'
)

fig.show()

In [16]:
import umap

# Project the first-token embeddings to 2 components with UMAP (10 neighbours, fixed seed).
umap_emb = umap.UMAP(n_neighbors=10,n_components=2,random_state=0).fit_transform(vectores)

In [20]:
# Scatter plot of the UMAP coordinates. Hover text and colour come from
# `samples`, so `umap_emb` is expected to have its rows in the same order
# as `samples`.
fig = px.scatter(
    samples, x=umap_emb[:,0], y=umap_emb[:,1], title='Visualización con UMAP',
    hover_name='RESUMEN', hover_data=['DIAGDESC','ESPECIALIDAD','CODIGO_MEDICAMENTO'], color='ESPECIALIDAD'
)

fig.show()

In [21]:
def sum_embedding(cfl_output):
    """Return the mean of the last-layer token vectors as a (1, hidden_size) array.

    Despite its name (kept so existing calls keep working), this function
    pools with the mean over the token axis, not the sum.

    Parameters
    ----------
    cfl_output : mapping
        Model output exposing ``'hidden_states'``; its last entry is indexed
        with ``[0]`` to take the first sequence of the batch.

    Returns
    -------
    numpy.ndarray
        Array with a single row holding the averaged token vectors.
    """
    token_vectors = cfl_output['hidden_states'][-1][0]
    # reshape(1, -1) instead of a hard-coded 768 so other hidden sizes also work;
    # .cpu() makes the conversion valid when the tensor is not on the CPU.
    return token_vectors.detach().cpu().numpy().mean(axis=0).reshape(1, -1)

In [26]:
%%time
for texto in samples['RESUMEN']:
    clf_obj = sentence_clf_output(texto)
    try:
        vectores_sum = np.concatenate([sum_embedding(clf_obj),vectores_sum], axis=0)
    except NameError:
        vectores_sum = sum_embedding(clf_obj)

CPU times: user 3min 10s, sys: 1.53 s, total: 3min 12s
Wall time: 32.1 s


In [27]:
# Check the size of the stacked pooled embeddings.
vectores_sum.shape

(1000, 768)

In [29]:
# Project the pooled embeddings with t-SNE (PCA initialisation, fixed seed).
emb_tsne_sum = TSNE(init='pca',random_state=0).fit_transform(vectores_sum)

# Hover text and colour come from `samples`, so `emb_tsne_sum` is expected to
# have its rows in the same order as `samples`.
fig = px.scatter(
    samples, x=emb_tsne_sum[:,0], y=emb_tsne_sum[:,1], title='Visualización con t-SNE, vectores suma',
    hover_name='RESUMEN', hover_data=['DIAGDESC','ESPECIALIDAD','CODIGO_MEDICAMENTO'], color='ESPECIALIDAD'
)

fig.show()


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



In [30]:
# Project the pooled embeddings to 2 components with UMAP (10 neighbours, fixed seed).
umap_emb_sum = umap.UMAP(n_neighbors=10,n_components=2,random_state=0).fit_transform(vectores_sum)

# Hover text and colour come from `samples`, so `umap_emb_sum` is expected to
# have its rows in the same order as `samples`.
fig = px.scatter(
    samples, x=umap_emb_sum[:,0], y=umap_emb_sum[:,1], title='Visualización con UMAP, vectores suma',
    hover_name='RESUMEN', hover_data=['DIAGDESC','ESPECIALIDAD','CODIGO_MEDICAMENTO'], color='ESPECIALIDAD'
)

fig.show()