# Installations

In [None]:
!pip install transformers
#!pip install torch



In [None]:
!pip install prince

Collecting prince
  Downloading prince-0.7.1-py3-none-any.whl (21 kB)
Installing collected packages: prince
Successfully installed prince-0.7.1


# Packages

In [None]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import time
import pandas as pd

# Load the pretrained BERT model

In [None]:
# Name of the pretrained checkpoint; the "cased" variant takes letter case into account.
model_name = "bert-base-cased"

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# output_hidden_states=True: also return the outputs of the hidden layers.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)
# Every pretrained model comes with its own tokenizer, so load the matching one.
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
# Place the model on the selected device (a no-op when it is already there).
model = model.to(device)

# Test of the tokenizer

In [None]:
sample = "let's encode a sentence"
print("raw: ", sample)
# BERT sequences are capped at 512 tokens, so longer inputs are truncated.
tokenized = tokenizer.tokenize(sample)[:512]
print('tokenized: ', tokenized)
input_inds = tokenizer.convert_tokens_to_ids(tokenized)
print('encoded: ', input_inds)
# "encode" is not part of the vocabulary, so it is split into sub-word pieces
# (same principle as BPE segmentation).

raw:  let's encode a sentence
tokenized:  ['let', "'", 's', 'en', '##code', 'a', 'sentence']
encoded:  [1519, 112, 188, 4035, 13775, 170, 5650]


In [None]:
# Wrap the ids in a batch of size one and move it to the selected device
# (GPU when available, CPU otherwise).
tensor_sentence = torch.tensor([input_inds]).to(device)
tensor_sentence

tensor([[ 1519,   112,   188,  4035, 13775,   170,  5650]], device='cuda:0')

In [None]:
# Inference only: skip building the autograd graph for this forward pass.
with torch.no_grad():
  output = model(tensor_sentence)
hidden_states = output.hidden_states
len(hidden_states)  # the first entry corresponds to the embedding layer

13

In [None]:
# Shape of the first entry (the embedding layer): batch of 1, one row per token.
hidden_states[0].shape

torch.Size([1, 7, 768])

In [None]:
# Embedding-layer output of the single sentence as a NumPy array:
# detach() removes it from the autograd graph, cpu() moves it off the GPU.
# (Previously this expression was evaluated and its result thrown away.)
embedding_output = output.hidden_states[0][0].detach().cpu().numpy()

# Drop the embedding layer and keep the remaining layers. Slicing from
# `output` rather than from `hidden_states` itself keeps this cell idempotent:
# re-running it no longer removes one more layer each time.
hidden_states = output.hidden_states[1:]

In [None]:
# One vector per remaining layer: average the token vectors of the (single)
# sentence in the batch, then convert the result to a NumPy array.
word_embeddings = [
  layer[0].mean(dim=0).detach().cpu().numpy()
  for layer in hidden_states
]

In [None]:
word_embeddings[0].shape
# Next step: wrap the steps above in a function that takes the model, the
# tokenizer and a sentence, and returns the list of per-layer embeddings.

(768,)

# Function for encoding a sentence

In [None]:
def encode_sentence(model, tokenizer, sentence, max_length=512):
  """Encode one sentence into a mean-pooled vector per hidden layer.

  The sentence is tokenized with ``tokenizer.tokenize`` and converted with
  ``tokenizer.convert_tokens_to_ids``; this function adds no special tokens
  itself. The ids are fed to ``model`` as a batch of size one, the first
  hidden state (the embedding layer) is dropped, and the token vectors of
  every remaining layer are averaged.

  Args:
    model: callable returning an object with a ``hidden_states`` sequence
      (e.g. a ``BertModel`` loaded with ``output_hidden_states=True``).
    tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
    sentence: text to encode.
    max_length: maximum number of tokens kept; longer inputs are truncated.
      Defaults to 512, the limit used throughout this notebook.

  Returns:
    A list with one 1-D NumPy array per hidden layer after the embedding
    layer.

  Raises:
    ValueError: if the model output carries no hidden states.
  """
  tokens = tokenizer.tokenize(sentence)[:max_length]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)

  # Follow the device the model actually lives on instead of depending on a
  # notebook-level `device` variable that may be missing or out of sync.
  try:
    model_device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    model_device = torch.device("cpu")
  input_ids = torch.tensor([token_ids]).to(model_device)

  # Pure inference: do not record an autograd graph for every sentence.
  with torch.no_grad():
    output = model(input_ids)

  hidden_states = output.hidden_states
  if hidden_states is None:
    raise ValueError(
        "model output has no hidden states; load the model with "
        "output_hidden_states=True")

  # Skip the embedding layer, then average over the token axis of the only
  # sentence in the batch.
  return [
    layer[0].mean(dim=0).detach().cpu().numpy()
    for layer in hidden_states[1:]
  ]

In [None]:
sentences = ["The first sentence", "the second sentence", "the third sentence"]
encoded_sentences = []
for sentence in sentences:
  # Stack the per-layer vectors of this sentence into a single 2-D array.
  document_embeddings = np.vstack(encode_sentence(model, tokenizer, sentence))
  encoded_sentences.append(document_embeddings)

In [None]:
# Stack the per-sentence arrays: the first axis indexes sentences,
# the second axis indexes layers.
concat_embeddings = np.array(encoded_sentences)
concat_embeddings.shape

(3, 12, 768)

In [None]:
# Swap the first two axes so that layers come first and sentences second.
final_embeddings = np.swapaxes(concat_embeddings, 0,1)
final_embeddings.shape

(12, 3, 768)

In [None]:
# A function for many sentences
def encode_sentences(model, tokenizer, sentences):
  """Encode several sentences and group the resulting vectors by layer.

  Each sentence goes through ``encode_sentence``; its per-layer vectors are
  stacked into one 2-D array. The per-sentence arrays are then combined and
  the first two axes swapped, so the result is indexed by layer first and by
  sentence second.

  Args:
    model: model forwarded to ``encode_sentence``.
    tokenizer: tokenizer forwarded to ``encode_sentence``.
    sentences: iterable of texts to encode.

  Returns:
    NumPy array whose first axis is the layer and second axis the sentence.
  """
  per_sentence = [
    np.vstack(encode_sentence(model, tokenizer, text))
    for text in sentences
  ]
  # sentence-major -> layer-major
  return np.array(per_sentence).swapaxes(0, 1)

# Encode all the dataset

In [None]:
# Load the corpus; the first CSV column is used as the row index.
df = pd.read_csv("classic3.csv", index_col=0)
df

Unnamed: 0,text,label
0,Milestones in Cataloging In the case of the pr...,cisi
1,childhood psychosis. a description is given of...,med
2,neonatal hepatitis or familial neonatal obstru...,med
3,Handbook of Comparative Librarianship The firs...,cisi
4,Design and Evaluation of Information Systems T...,cisi
...,...,...
3886,modification of autistic behavior with lsd-25....,med
3887,Patterns of Evaluation in Science: Institution...,cisi
3888,The government of the American Public Library ...,cisi
3889,base pressure at subsonic speeds in the presen...,cran


In [None]:
# Raw documents to encode, as a NumPy array taken from the 'text' column.
texts = df['text'].values

In [None]:
# Encode the whole corpus and report the elapsed wall-clock time in seconds.
s = time.time()
print(device)
matrices = encode_sentences(model, tokenizer, texts)
print(time.time() - s)

cuda
178.4523799419403


In [None]:
# encode_sentences returns layers on the first axis and documents on the second.
matrices.shape

(12, 3891, 768)

# Kmeans

In [None]:
# Integer code assigned to each class name.
code = {'cisi':0, 'med':1, 'cran':2}
# Encode the string labels as integers. The guard keeps the cell safe to
# re-run: mapping already-encoded integers through `code` a second time
# would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
  df['label'] = df['label'].map(code)
df

Unnamed: 0,text,label
0,Milestones in Cataloging In the case of the pr...,0
1,childhood psychosis. a description is given of...,1
2,neonatal hepatitis or familial neonatal obstru...,1
3,Handbook of Comparative Librarianship The firs...,0
4,Design and Evaluation of Information Systems T...,0
...,...,...
3886,modification of autistic behavior with lsd-25....,1
3887,Patterns of Evaluation in Science: Institution...,0
3888,The government of the American Public Library ...,0
3889,base pressure at subsonic speeds in the presen...,2


In [None]:
from sklearn.metrics import normalized_mutual_info_score
from sklearn.cluster import KMeans

# Cluster the documents separately on each layer's embeddings and compare the
# clusters with the true labels using normalized mutual information (NMI).
# random_state is fixed so the scores are reproducible across runs.
scores = []
for matrice in matrices:
  kmeans = KMeans(n_clusters=3, random_state=0).fit(matrice)
  scores.append(normalized_mutual_info_score(df.label.values, kmeans.labels_))

# One NMI score per layer (was `print(score)`, an undefined name).
print(scores)

[0.8905960311111826, 0.8901491080580187, 0.8786968165755835, 0.8621254814370131, 0.8576860912678019, 0.852321142930996, 0.8590769783858758, 0.8471458795699961, 0.8461329547165068, 0.8611885140188348, 0.8660208175683651, 0.8493954541253715]


In [None]:
from sklearn.decomposition import PCA
# A float n_components between 0 and 1 asks PCA to keep as many components
# as needed to explain that fraction of the variance (95% here).
pca_res = PCA(n_components=0.95)
# Fit on, and project, the embeddings stored in the first entry of `matrices`.
res = pca_res.fit_transform(matrices[0])

In [None]:
# NOTE(review): this cell originally contained only the incomplete statement
# `from scipy.linalg`, which is a SyntaxError. The names it was meant to
# import, and the code that produced the saved output below, cannot be
# recovered from the notebook. TODO: confirm the intent and restore it.
import scipy.linalg

array([[ 0.01172415, -0.00596982,  0.00179749, ..., -0.00255189,
        -0.00317768,  0.00186779],
       [-0.00596982,  0.01902654, -0.0037803 , ...,  0.0031545 ,
         0.00589006, -0.00579815],
       [ 0.00179749, -0.00378029,  0.01087755, ..., -0.00127052,
        -0.00143191,  0.00191979],
       ...,
       [-0.00255188,  0.0031545 , -0.00127052, ...,  0.0147762 ,
        -0.0016207 , -0.00108633],
       [-0.00317768,  0.00589006, -0.00143191, ..., -0.0016207 ,
         0.01549799, -0.00418078],
       [ 0.00186779, -0.00579815,  0.00191979, ..., -0.00108633,
        -0.00418078,  0.01408602]], dtype=float32)