In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install sentencepiece
!pip install transformers

In [61]:
from tqdm.auto import tqdm

In [8]:
import torch
from transformers import CamembertModel, CamembertTokenizer

In [18]:
model_name = "camembert/camembert-base" 
tokenizer = CamembertTokenizer.from_pretrained(model_name)
model = CamembertModel.from_pretrained(model_name)

In [27]:
tokenized_sentence = tokenizer.tokenize("fromage")
tokenized_sentence

['▁fromage']

In [28]:
encoded_sentence = tokenizer.encode(tokenized_sentence, return_tensors="pt")
encoded_sentence

tensor([[   5, 5271,    6]])

In [29]:
output = model(encoded_sentence)

In [30]:
output.last_hidden_state

tensor([[[-0.0592,  0.1559,  0.1587,  ..., -0.0393, -0.0136, -0.0732],
         [ 0.0129,  0.0980,  0.1464,  ..., -0.0767,  0.0197, -0.0514],
         [-0.0673, -0.0669, -0.0682,  ..., -0.0299,  0.0679,  0.0939]]],
       grad_fn=<NativeLayerNormBackward0>)

In [77]:
print(output.last_hidden_state.tolist()[0][1])

[0.0103547852486372, 0.11577501893043518, 0.1415213793516159, -0.13417485356330872, -0.011738214641809464, 0.04924704134464264, 0.05636679381132126, 0.2124384641647339, 0.015817614272236824, 0.05223260447382927, 0.04529968649148941, 0.1473664939403534, -0.03848937898874283, 0.08483855426311493, 0.22709961235523224, 0.03628159314393997, 0.04804034158587456, -0.10260544717311859, 0.11319787800312042, -0.1434229165315628, 0.04219117760658264, -0.07806473970413208, -0.005882613360881805, -0.35138699412345886, 0.23521076142787933, -0.19420436024665833, -0.03787880390882492, -0.0983339175581932, -0.04269667714834213, 0.042229097336530685, 0.034495290368795395, -0.21455423533916473, 0.07983303815126419, 0.1112215518951416, 0.1796291470527649, -0.1345979869365692, -0.059356607496738434, 0.15660767257213593, -0.10297033190727234, -0.07735864073038101, -0.14564178884029388, 0.05280586704611778, 0.2045353353023529, -0.06680627912282944, 0.12153518199920654, 0.18465834856033325, -0.250654458999633

In [74]:
print(torch.mean(output.last_hidden_state, 1).tolist()[0])

[-0.034405458718538284, 0.13604597747325897, 0.09219849854707718, -0.07692411541938782, -0.021292567253112793, 0.03322410210967064, 0.007805342320352793, 0.16670455038547516, 0.06959977746009827, 0.01482544094324112, 0.026018591597676277, 0.09238534420728683, -0.04735630378127098, 0.11495938897132874, 0.09565114974975586, 0.033323537558317184, 0.05916470289230347, -0.10682153701782227, 0.1087389588356018, -0.1406429409980774, 0.06393458694219589, -0.037848860025405884, 0.024826450273394585, -0.23858503997325897, 0.20689012110233307, -0.12651488184928894, 0.022471049800515175, -0.13977445662021637, -0.038211170583963394, 0.03986980393528938, -0.007436590734869242, -0.11510195583105087, 0.08738519996404648, 0.06140679121017456, 0.12705834209918976, -0.12816332280635834, -0.00760863209143281, 0.06759098172187805, -0.05131547525525093, -0.21679802238941193, -0.07601089775562286, 0.04796428605914116, 0.1356140524148941, -0.03707750886678696, 0.09639143943786621, 0.161521777510643, -0.169752

In [75]:
def get_embeddings(word):
    encoded_word = tokenizer.encode(word, return_tensors="pt")
    output = model(encoded_word)
    return torch.mean(output.last_hidden_state, 1).tolist()[0]

In [80]:
print(get_embeddings("fromage"))

[-0.037858154624700546, 0.062335316091775894, 0.07900408655405045, -0.06916352361440659, -0.015011784620583057, 0.025408804416656494, -0.027349278330802917, 0.15314526855945587, 0.0275435671210289, 0.0227921511977911, -0.002132667927071452, 0.11103885620832443, -0.08612626045942307, 0.08913201093673706, 0.12978923320770264, 0.046276699751615524, 0.0910901129245758, -0.05799365043640137, 0.0911022201180458, -0.1303921341896057, 0.02576400525867939, -0.06178438290953636, 0.00634428858757019, -0.22462058067321777, 0.2469114512205124, -0.1348501294851303, 0.025041840970516205, -0.12917880713939667, -0.024989688768982887, 0.09192093461751938, -0.015060093253850937, -0.11628103256225586, 0.06101334095001221, 0.054161977022886276, 0.14580340683460236, -0.0981144830584526, 0.0005026459693908691, 0.10419734567403793, -0.05709323287010193, -0.13490383327007294, -0.08853445202112198, 0.07166898250579834, 0.14052028954029083, -0.018179625272750854, 0.11810598522424698, 0.16000281274318695, -0.1538

In [76]:
embedded_vocab = {}
for token in tqdm(tokenizer.get_vocab().keys()):
    embedded_vocab[token] = get_embeddings(token)
    break
print(embedded_vocab)

  0%|          | 0/32005 [00:00<?, ?it/s]

{'<s>NOTUSED': [-0.013191726990044117, 0.08629611879587173, 0.12257470935583115, -0.1352040320634842, -0.05540192499756813, 0.039461757987737656, 0.010911076329648495, 0.1480531543493271, 0.006371233146637678, 0.040329787880182266, 0.011856067925691605, 0.1589142233133316, -0.09061679989099503, 0.08979596942663193, 0.23230904340744019, 0.019494855776429176, 0.05265653505921364, -0.06231135129928589, 0.11351432651281357, -0.1555146425962448, 0.03972146287560463, -0.08810269832611084, -0.01307667326182127, -0.3714299201965332, 0.2608779966831207, -0.2082325965166092, -0.052114665508270264, -0.12008246779441833, -0.004816381726413965, 0.05595296621322632, 0.0408436544239521, -0.18877798318862915, 0.09432346373796463, 0.11581733077764511, 0.1610664576292038, -0.12378012388944626, -0.04619845747947693, 0.0908435508608818, -0.12524688243865967, -0.08025585860013962, -0.10422921925783157, 0.08487853407859802, 0.20593826472759247, -0.07006166130304337, 0.1256893426179886, 0.21790999174118042, 