In [9]:
import pickle 
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModel
from nltk.util import ngrams 
from collections import Counter
import regex as re 
import torch 
import torch.nn.functional as F
from functools import lru_cache
import spacy
from nltk.corpus import stopwords
from numpy import dot
from numpy.linalg import norm
import functorch
import numpy as np 
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler

In [10]:
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load 'all_data.pkl' when it comes from a trusted source.
# The context manager closes the file handle even if unpickling fails.
with open('all_data.pkl', 'rb') as all_data_file:
    all_content = pickle.load(all_data_file)

In [4]:
# Module-level n-gram weights. No cell visible in this notebook reads this
# list; TFScorer assigns its own attribute with the same values.
# NOTE(review): presumably [unigram, bigram] weights -- confirm or remove.
ngram_weights = [0.2, 0.8]

In [229]:
# Language name -> name of the spaCy pipeline handed to spacy.load().
lang_nlp_map = dict(
    english='en_core_web_sm',
    german='de_core_news_sm',
    portuguese='pt_core_news_sm',
)

# Two-letter language code (the keys of all_content) -> language name.
lang_name_map = dict(
    en='english',
    de='german',
    pt='portuguese',
)

In [21]:
class TfTokenizer:
    """Callable tokenizer for CountVectorizer: spaCy lemmas minus stopwords.

    Args:
        lang: Language name. It must be a key of ``lang_nlp_map`` (used to
            pick the spaCy pipeline) and a language name accepted by NLTK's
            ``stopwords.words``.
    """

    def __init__(self, lang):
        # Fixed: the original indexed ``lang_map``, a name that no visible
        # cell defines (it only worked through leftover kernel state). The
        # language-name -> pipeline mapping in this notebook is lang_nlp_map.
        self.nlp = spacy.load(lang_nlp_map[lang])
        # A set gives constant-time membership tests in __call__.
        self.stopwords = set(stopwords.words(lang))

    def __call__(self, doc):
        """Return the lemma of every token whose surface text is not a stopword."""
        return [token.lemma_ for token in self.nlp(doc) if token.text not in self.stopwords]

def cosine_similarity(a, b):
    """Cosine similarity of two vectors; 0 when they are orthogonal or either is all-zero."""
    inner = dot(a, b)
    if inner == 0:
        return 0
    length_a = norm(a)
    if length_a == 0:
        return 0
    length_b = norm(b)
    if length_b == 0:
        return 0
    return inner / (length_a * length_b)

def compute_cosine_similarity(A, B):
    """Pairwise cosine similarity between the rows of two 2-D arrays.

    Args:
        A: array-like of shape (n, d).
        B: array-like of shape (m, d).

    Returns:
        Array of shape (n, m) whose entry [i, j] is the cosine similarity of
        A[i] and B[j]. A row that is entirely zero has similarity 0 with
        everything (the original divided by a zero norm and produced NaN).
    """
    def _unit_rows(matrix):
        matrix = np.asarray(matrix)
        norms = np.linalg.norm(matrix, axis=1)[:, np.newaxis]
        # Dividing an all-zero row by 1 leaves it all-zero, so its dot
        # products are 0 instead of NaN.
        norms[norms == 0] = 1
        return matrix / norms

    # The dot product of unit-length rows is their cosine similarity.
    return np.dot(_unit_rows(A), _unit_rows(B).T)

In [300]:
class TFScorer:
    """Score sentences against property strings using term-count cosine similarity.

    Sentences and both property lists are lemmatized and lower-cased with
    spaCy, stripped of NLTK stopwords, turned into unigram count vectors and
    compared pairwise with ``compute_cosine_similarity``.

    Args:
        sents: list of sentence strings.
        obj_properties: list of object-property strings.
        sub_properties: list of subject-property strings.
        language: key of ``lang_nlp_map`` that is also a language name
            accepted by NLTK's ``stopwords.words``.
    """

    def __init__(self, sents, obj_properties, sub_properties, language):
        self.sents = sents
        self.obj_properties = obj_properties
        self.sub_properties = sub_properties
        self.language = language
        self.ngram_weights = [0.2, 0.8]
        self.nlp = spacy.load(lang_nlp_map[self.language])
        # Load the stopword list once; the original re-read it for every token.
        self._stopwords = set(stopwords.words(self.language))
        self.vectorizer = CountVectorizer(tokenizer=TfTokenizer(self.language), ngram_range=(1, 1))

        self.processed_sents = [self._preprocess(sent) for sent in self.sents]
        self.processed_obj_properties = [self._preprocess(prop) for prop in self.obj_properties]
        self.processed_sub_properties = [self._preprocess(prop) for prop in self.sub_properties]
        self._fit_vectorizer()

    def _preprocess(self, sent):
        """Return the space-joined, lower-cased lemmas of the non-stopword tokens."""
        return " ".join(
            token.lemma_.lower()
            for token in self.nlp(sent)
            if token.text not in self._stopwords
        )

    def _fit_vectorizer(self):
        """Build the vocabulary from the same preprocessed texts that get transformed.

        The original fitted on the raw strings but transformed the
        preprocessed ones. Any token that appeared only after preprocessing
        was then missing from the vocabulary and silently ignored by
        ``transform``.
        """
        self.vectorizer.fit(
            self.processed_sents
            + self.processed_obj_properties
            + self.processed_sub_properties
        )

    def _similarity_to(self, sent_vectors, processed_properties):
        """Sentence-by-property similarity, or one empty list per sentence when there are no properties."""
        if len(processed_properties) > 0:
            prop_vectors = self.vectorizer.transform(processed_properties).toarray()
            return compute_cosine_similarity(sent_vectors, prop_vectors)
        return [[] for _ in range(len(sent_vectors))]

    def _tf_scores(self):
        """Return (object_similarities, subject_similarities), one row per sentence."""
        sent_vectors = self.vectorizer.transform(self.processed_sents).toarray()
        obj_similarity_matrix = self._similarity_to(sent_vectors, self.processed_obj_properties)
        sub_similarity_matrix = self._similarity_to(sent_vectors, self.processed_sub_properties)
        return obj_similarity_matrix, sub_similarity_matrix

    def compute_similarity(self):
        """Public entry point; see ``_tf_scores`` for the return value."""
        return self._tf_scores()

In [None]:
def score_title_paragraphs(entry, sentence_key, language):
    """Compute per-paragraph syntactic scores for one article entry.

    Args:
        entry: dict for one title; must hold ``sentence_key`` (a list of
            paragraphs, each a list of sentence strings), 'object_properties'
            and 'subject_properties' (iterables of string sequences).
        sentence_key: which field of ``entry`` supplies the sentences.
        language: language name passed to TFScorer.

    Returns:
        (obj_para_scores, sub_para_scores). Each is a list with one item per
        paragraph, and each item holds one score row per sentence of that
        paragraph.
    """
    paragraphs = entry[sentence_key]
    sents = [sent for para in paragraphs for sent in para]
    # Paragraph index of every flattened sentence. This is always derived
    # from the sentences being scored. The original non-English branch never
    # set ``sent_src`` and silently reused the value left over from the
    # previous English title.
    sent_src = [i for i, para in enumerate(paragraphs) for _ in para]
    obj_properties = [" ".join(fc) for fc in entry['object_properties']]
    sub_properties = [" ".join(fc) for fc in entry['subject_properties']]

    # One bucket per paragraph up front, so an empty paragraph cannot shift
    # later buckets or cause an IndexError.
    obj_para_scores = [[] for _ in paragraphs]
    sub_para_scores = [[] for _ in paragraphs]
    if not sents:
        # Nothing to score; the vectorizer cannot be fitted on no text.
        return obj_para_scores, sub_para_scores

    tf_scorer = TFScorer(sents, obj_properties, sub_properties, language)
    obj_similarity_matrix, sub_similarity_matrix = tf_scorer.compute_similarity()
    for i, obj_score, sub_score in zip(sent_src, obj_similarity_matrix, sub_similarity_matrix):
        obj_para_scores[i].append(obj_score)
        sub_para_scores[i].append(sub_score)
    return obj_para_scores, sub_para_scores


for lang in all_content:
    for title in list(all_content[lang].keys()):
        print(lang, title)
        entry = all_content[lang][title]
        if lang == 'en':
            obj_para_scores, sub_para_scores = score_title_paragraphs(
                entry, 'len_filtered_sentences', lang_name_map[lang])
        else:
            # Non-English articles are scored on their English translations
            # against the English property strings.
            # NOTE(review): the scores are grouped by the paragraphs of
            # 'translated_sents'; later cells index 'len_filtered_sentences'
            # with the same positions -- confirm both fields share a layout.
            obj_para_scores, sub_para_scores = score_title_paragraphs(
                entry, 'translated_sents', 'english')
        entry['obj_syn_scores'] = obj_para_scores
        entry['sub_syn_scores'] = sub_para_scores

In [3]:
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load these dumps when they come from a trusted source.
# The context managers close each file handle even if unpickling fails.
with open('syn_scores_dump.pkl', 'rb') as syn_scores_file:
    syntactic_scores = pickle.load(syn_scores_file)
with open('non_eng_syn_scores_dump.pkl', 'rb') as non_eng_syn_scores_file:
    syntactic_scores_l = pickle.load(non_eng_syn_scores_file)

In [4]:
# One independent, initially empty title -> entry dict per language code.
filtered_entities = {lang_code: {} for lang_code in ('en', 'pt', 'de')}

In [5]:
# Copy into filtered_entities every title that carries at least one of the
# two syntactic score fields. syntactic_scores is processed first, then
# syntactic_scores_l, so a title present in both ends up with the latter entry.
for score_dump in (syntactic_scores, syntactic_scores_l):
    for lang, titles in score_dump.items():
        for title, entry in titles.items():
            if 'sub_syn_scores' in entry or 'obj_syn_scores' in entry:
                filtered_entities[lang][title] = entry

In [6]:
count = 0
# Minimum similarity (exclusive) for a property to count as a candidate match.
SCORE_THRESHOLD = 0.2


def collect_candidates(sents, props, para_score_lists, threshold=SCORE_THRESHOLD):
    """Pair each sentence with the properties whose score exceeds ``threshold``.

    Args:
        sents: list of paragraphs, each a list of sentences.
        props: list of properties, indexed like the columns of a score row.
        para_score_lists: list (per paragraph) of lists (per sentence) of
            score rows (one score per property).
        threshold: a property is kept only when its score is strictly greater.

    Returns:
        List of {'sent': sentence, 'props': [(property, score), ...]} dicts,
        one per sentence that has at least one property above the threshold.

    Raises:
        IndexError: when the score layout does not match ``sents``/``props``.
    """
    candidates = []
    for p, para_scores in enumerate(para_score_lists):
        for s, scores in enumerate(para_scores):
            valid_scores = [(props[k], score) for k, score in enumerate(scores) if score > threshold]
            if valid_scores:
                candidates.append({'sent': sents[p][s], 'props': valid_scores})
    return candidates


for lang in filtered_entities:
    for title in filtered_entities[lang]:
        entry = filtered_entities[lang][title]
        try:
            sents = entry['len_filtered_sentences']
            candidate_obj = collect_candidates(sents, entry['object_properties'], entry['obj_syn_scores'])
            candidate_sub = collect_candidates(sents, entry['subject_properties'], entry['sub_syn_scores'])
        except Exception as e:
            # Best effort: skip the title but report why it failed. The
            # original printed only the title and hid the error.
            print(lang, title, repr(e))
        else:
            # Written only when both lists were built, as in the original.
            entry['candidate_obj'] = candidate_obj
            entry['candidate_sub'] = candidate_sub

pt McDonnell XF-85 Goblin
pt São Paulo
pt History of Botafogo de Futebol e Regatas
pt Alberto Henschel
pt Frank Headlam
pt Albert, Prince Consort
de Boeing 747
de Dassault Rafale
de Bombing of Dresden in World War II
de Fortifications of Frankfurt
de Leipzig
de Altes Stadthaus, Berlin
de Minardi
de San Jose Sharks
de Stade de Reims
de Walter Benjamin
de Lewis Carroll
de Richard Foerster (classical scholar)


In [7]:
# Keep only the titles for which candidate extraction produced results.
# Fixed: the original stored the whole ``filtered_entities[lang]`` dict as
# soon as any title matched, so titles without candidates were dumped too and
# the per-title check had no effect.
actual_final_dump = {}
for lang in filtered_entities:
    for title in filtered_entities[lang]:
        entry = filtered_entities[lang][title]
        if 'candidate_obj' in entry or 'candidate_sub' in entry:
            actual_final_dump.setdefault(lang, {})[title] = entry

In [8]:
# The context manager flushes and closes the file even if pickling raises.
with open('candidates.pkl', 'wb') as candidate_dump:
    pickle.dump(actual_final_dump, candidate_dump)

In [116]:
# List the titles kept under the German ('de') key of filtered_entities.
filtered_entities['de'].keys()

dict_keys(['Verbal Behavior', 'The Devil to Pay in the Backlands', "Plato's unwritten doctrines", 'Bradley Fighting Vehicle', 'Rockwell B-1 Lancer', 'Boeing 747', 'Flakpanzer Gepard', 'Dassault Rafale', 'Beetzsee (municipality)', 'U2 (Berlin U-Bahn)', 'Bhaktapur', 'Südbrookmerland', 'Moormerland', 'Bombing of Dresden in World War II', 'Toronto', 'Fribourg', 'Fortifications of Frankfurt', 'Leipzig', 'Hattusa', 'Poverty Point', 'Altes Stadthaus, Berlin', 'Buried Pyramid', 'Salzhaus', "Watkin's Tower", 'Rötteln Castle', "St. Catherine's Church, Frankfurt", 'Palace of Westminster', 'Saint Cyriakus, Gernrode', 'Church of the Redeemer, Sacrow', 'Goetheanum', 'Minardi', 'France national football team', 'Blue Angels', 'San Jose Sharks', 'Switzerland national football team', 'FLN football team', 'Andrea Moda Formula', "France women's national football team", 'Stade de Reims', 'Walter Benjamin', 'Apollo 8', 'Munir Bashir', 'Philo of Larissa', 'Henry the Fowler', 'Hilde Zimmermann', 'Clara Schuma

In [127]:
# Inspect the object-property candidates stored for one German article.
filtered_entities['de']['Clara Schumann']['candidate_obj']

[{'sent': 'Sie war von 1840 bis zu dessen Tod 1856 die Ehefrau Robert Schumanns.',
  'props': [(('child', 'Eugenie Schumann'), 0.2182178902359924),
   (('death place', 'Frankfurt'), 0.2182178902359924),
   (('spouse', 'Robert Schumann'), 0.4364357804719848)]},
 {'sent': '(Kaufmannswitwe).Ihre Eltern waren Friedrich Wieck (1785–1873) und Mariane Wieck geb. Tromlitz (1797–1872).',
  'props': [(('father', 'Friedrich Wieck'), 0.33968311024337877)]},
 {'sent': 'Claras Mutter war die Tochter eines Kantors.',
  'props': [(('mother', 'Mariane Bargiel'), 0.2182178902359924)]},
 {'sent': 'Friedrich Wieck nannte sie in seinen Schriften gerne neben Clara und Marie als eine seiner „drei Töchter“.Die Mutter Mariane Wieck schloss schon 1825 eine zweite Ehe mit dem Klavierlehrer Adolph Bargiel und zog mit ihm nach Berlin, wo sie weiter als Klavierlehrerin tätig war.',
  'props': [(('mother', 'Mariane Bargiel'), 0.2847473987257497),
   (('father', 'Friedrich Wieck'), 0.2847473987257497)]},
 {'sent': 'M

In [119]:
# Number of top-level items in 'translated_sents' for this article (the
# scoring loop iterates this field as paragraphs of sentences).
len(filtered_entities['de']['Buried Pyramid']['translated_sents'])

14

In [80]:
# Number of score rows stored at paragraph index 4 of this Portuguese article.
len(filtered_entities['pt']['A Vindication of the Rights of Men']['obj_syn_scores'][4])

21

## Semantic Similarity

In [12]:
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# model can still be loaded on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): 'max_length' is a padding strategy, not one of the documented
# truncation strategies. Every tokenizer call in this notebook passes its own
# padding/truncation/max_length arguments, so these load-time kwargs are
# presumably inert -- confirm, then drop them.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base", padding='max_length', truncation='max_length', max_length=512)
# output_hidden_states=True makes the per-layer states available to get_embedding.
model = AutoModel.from_pretrained("xlm-roberta-base", output_hidden_states=True).to(device)
# Inference mode (disables dropout); as the last expression it also displays the model.
model.eval()

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0): XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     

In [13]:
# List the titles available under the English ('en') key of all_content.
all_content['en'].keys()

dict_keys(['Darkness Visible (memoir)', 'Verbal Behavior', 'The Devil to Pay in the Backlands', "Plato's unwritten doctrines", 'Airbus A320 family', 'Transall C-160', 'Boeing KC-135 Stratotanker', 'Bradley Fighting Vehicle', 'Rockwell B-1 Lancer', 'Boeing B-52 Stratofortress', 'Landing Vehicle Tracked', 'Boeing 737', 'Soyuz (spacecraft)', 'Flakpanzer Gepard', 'Dassault Rafale', 'Canberra', 'Beetzsee (municipality)', 'U2 (Berlin U-Bahn)', 'Kleinmachnow', 'Bhaktapur', 'Südbrookmerland', 'São Paulo', 'Memmingen', 'Wetzlar', 'Bombing of Dresden in World War II', 'Norden, Lower Saxony', 'Toronto', 'Schwieberdingen', 'Eberswalde', 'Freiburg im Breisgau', 'Fortifications of Frankfurt', 'Leipzig', 'Hattusa', 'Limes Germanicus', 'Poverty Point', 'Altes Stadthaus, Berlin', 'Federal Palace of Switzerland', 'Buried Pyramid', 'Salzhaus', "Watkin's Tower", 'Rötteln Castle', "St. Catherine's Church, Frankfurt", 'St. Leonhard, Frankfurt', 'Saint Cyriakus, Gernrode', 'Reichstag building', 'Church of th

In [14]:
# Paragraph-structured sentences of the English 'Clara Schumann' article.
clara_paragraphs = all_content['en']['Clara Schumann']['len_filtered_sentences']
# Flatten the paragraphs into one sentence list and record, for every
# sentence, the index of the paragraph it came from.
sents = [sentence for paragraph in clara_paragraphs for sentence in paragraph]
sent_src = [para_idx for para_idx, paragraph in enumerate(clara_paragraphs) for _ in paragraph]
# Consecutive chunks of at most 32 sentences for batched encoding.
sents_batched = [sents[start:start + 32] for start in range(0, len(sents), 32)]
# Alternative kept for reference: join each property into a single string.
#obj_properties = [" ".join(fc) for fc in all_content['en']['Clara Schumann']['object_properties']]
obj_properties = all_content['en']['Clara Schumann']['object_properties']

In [15]:
# Tokenize all properties as one batch, padded/truncated to 64 tokens each.
props_tokenized = tokenizer(obj_properties, padding='max_length', truncation='longest_first', max_length=64, return_tensors="pt")
with torch.no_grad():
    # Keep the last-layer hidden state at sequence position 0 as the vector
    # for each property, moved to the CPU as a NumPy array.
    # NOTE(review): position 0 is presumably the start-of-sequence token -- confirm.
    prop_outputs = model(**props_tokenized.to(device), output_hidden_states=True).last_hidden_state[:, 0, :].cpu().numpy()

In [16]:
# Encode the sentences batch by batch and collect one array per batch.
all_sent_outpus = []
for sent_batch in sents_batched:
    # Every batch is padded/truncated to 256 tokens.
    sents_tokenized = tokenizer(sent_batch, padding='max_length', truncation='longest_first', max_length=256, return_tensors="pt")
    with torch.no_grad():
        # Same pooling as for the properties: last-layer state at position 0.
        sent_outputs = model(**sents_tokenized.to(device), output_hidden_states=True).last_hidden_state[:, 0, :].cpu().numpy()
    all_sent_outpus.append(sent_outputs)

In [17]:
# Stack the per-batch arrays along axis 0 into one array (one row per sentence).
final_sent_outputs = np.concatenate(all_sent_outpus, axis=0)

In [19]:
# Shapes of the property and sentence embedding arrays.
prop_outputs.shape, final_sent_outputs.shape

((10, 768), (557, 768))

In [22]:
# Pairwise cosine similarity: one row per row of final_sent_outputs, one
# column per row of prop_outputs.
out = compute_cosine_similarity(final_sent_outputs, prop_outputs)

In [23]:
# Display the similarity matrix.
out

array([[0.8832458 , 0.9941124 , 0.9980223 , ..., 0.9966349 , 0.6399047 ,
        0.9916733 ],
       [0.83439755, 0.9985744 , 0.99510986, ..., 0.99736214, 0.5622595 ,
        0.99903464],
       [0.8615725 , 0.9969156 , 0.99741226, ..., 0.9976612 , 0.6045088 ,
        0.99592924],
       ...,
       [0.81098324, 0.99746126, 0.99073875, ..., 0.9946678 , 0.52754176,
        0.9990658 ],
       [0.87483054, 0.99549055, 0.9973259 , ..., 0.9969659 , 0.6260341 ,
        0.9930873 ],
       [0.8278709 , 0.99856275, 0.9940498 , ..., 0.9969541 , 0.5523938 ,
        0.99936473]], dtype=float32)

In [37]:
# Largest similarity in each row of out.
# NOTE(review): this cell's execution count (37) is later than the cells that
# assign `out` here, so the saved output may belong to a different `out`.
np.max(out, axis=1)

array([ 0.43649784,  0.75433946,  0.65712047,  0.7220949 ,  0.8815183 ,
        0.83193445,  0.8288648 ,  0.86397684,  0.7531442 ,  0.82266873,
        0.8180884 ,  0.82607967,  0.8824812 ,  0.75147825,  0.62459445,
        0.8487889 ,  0.8335014 ,  0.8918784 ,  0.6626229 ,  0.81221235,
        0.7844057 ,  0.72296655,  0.772118  ,  0.7392936 ,  0.28352898,
        0.8744571 ,  0.87382126,  0.8495346 ,  0.8386418 ,  0.7729629 ,
        0.7224885 ,  0.7668305 ,  0.5931993 ,  0.6536962 ,  0.85726404,
        0.78911126,  0.8393857 ,  0.73042846,  0.8227645 ,  0.8081233 ,
        0.74315995,  0.85102415,  0.78191704,  0.7664764 ,  0.8092071 ,
        0.46876103,  0.6497412 ,  0.8092092 ,  0.79263085,  0.86205363,
        0.5862675 ,  0.72781795,  0.7350776 ,  0.8853601 ,  0.7839751 ,
        0.85324496,  0.28509763,  0.8215014 ,  0.5419542 ,  0.7914689 ,
        0.8186127 ,  0.8728595 ,  0.86027384,  0.8536103 ,  0.6333502 ,
        0.6244161 ,  0.8040589 ,  0.7887145 ,  0.76509655,  0.73

In [46]:
# Show the second sentence of the flattened sentence list.
sents[1]

'Regarded as one of the most distinguished pianists of the Romantic era, she exerted her influence over the course of a 61-year concert career, changing the format and repertoire of the piano recital by lessening the importance of purely virtuosic works.'

In [43]:
# Show the property list.
# NOTE(review): the saved output shows joined strings, which matches the
# commented-out assignment in the earlier cell rather than the active one --
# presumably a stale output from a previous run.
obj_properties

['birth place Leipzig',
 'mother Mariane Bargiel',
 'child Eugenie Schumann',
 'organization Hoch Conservatory',
 'birth place Kingdom of Saxony',
 'death place German Empire',
 'father Friedrich Wieck',
 'birth place German Confederation',
 'death place Frankfurt',
 'spouse Robert Schumann']

In [24]:
def group_duplicates(embeddings, lst, mean=True):
    """Group rows of ``embeddings`` by the integer ids in ``lst``.

    Args:
        embeddings: 2-D torch tensor; row ``k`` belongs to group ``lst[k]``.
        lst: one group id per row. Ids are used directly as indices into the
            result list; ``None`` marks a row that belongs to no group.
        mean: when True, each group is reduced to its mean row of shape (1, d).

    Returns:
        List with ``len(set(lst))`` slots (unchanged from the original sizing).
        Slot ``g`` holds the rows of group ``g``, stacked as (rows, d) or
        averaged to (1, d). A slot that received no rows stays ``None``; this
        happens when ``lst`` contains ``None``.
    """
    output = [None for _ in range(len(set(lst)))]
    for idx, group in enumerate(lst):
        if group is None:
            continue
        row = embeddings[idx, :].reshape(1, -1)
        # Identity checks: `== None` against a tensor is not a reliable emptiness test.
        if output[group] is None:
            output[group] = row
        else:
            output[group] = torch.cat((output[group], row), dim=0)
    if mean:
        for slot, rows in enumerate(output):
            # Skip slots that never received a row; the original called
            # torch.mean(None) here and raised.
            if rows is not None:
                output[slot] = torch.mean(rows, dim=0).reshape(1, -1)
    return output

def get_embedding(tokens, split_into_words=False):
    """Embed ``tokens`` by averaging the model's hidden states over all layers.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        tokens: text (or pre-split words) handed to the tokenizer.
        split_into_words: forwarded as ``is_split_into_words``.

    Returns:
        (embeddings, word_ids): the layer-averaged states and the tokenizer's
        word ids, each with its first and last element sliced off.
        NOTE(review): presumably this drops the two special tokens of a single
        sequence; with a batch of several inputs the squeeze keeps the batch
        axis and the slice would act on it instead -- confirm callers pass
        exactly one sequence.
    """
    with torch.no_grad():
        encoded = tokenizer(
            tokens,
            padding=True,
            truncation=True,
            max_length=512,
            is_split_into_words=split_into_words,
            return_tensors="pt",
        ).to(device)
        layer_states = model(**encoded).hidden_states
        # One slice per returned hidden state; size-1 axes are then removed.
        stacked = torch.stack(list(layer_states)).squeeze()
        layer_average = torch.mean(stacked, dim=0)
        return layer_average[1:-1], encoded.word_ids()[1:-1]

def get_facts(facts, token_split=True):
    """Split fact strings into words and give each word a group id.

    Args:
        facts: sequence of strings.
        token_split: when False, ``facts`` is returned untouched with an
            empty id list.

    Returns:
        (words, ids). ``words`` holds every whitespace-separated piece of
        every fact, with underscores turned into spaces. ``ids`` holds, for
        each piece, the position of its fact minus one, so the first fact's
        pieces get -1.
    """
    if not token_split:
        return facts, []
    words = []
    group_ids = []
    for position, fact in enumerate(facts):
        pieces = fact.split()
        words.extend(piece.replace('_', ' ') for piece in pieces)
        group_ids.extend([position - 1] * len(pieces))
    return words, group_ids

In [25]:
# Show the property list (the saved output shows 2-tuples of relation and value).
obj_properties

[('birth place', 'Leipzig'),
 ('mother', 'Mariane Bargiel'),
 ('child', 'Eugenie Schumann'),
 ('organization', 'Hoch Conservatory'),
 ('birth place', 'Kingdom of Saxony'),
 ('death place', 'German Empire'),
 ('father', 'Friedrich Wieck'),
 ('birth place', 'German Confederation'),
 ('death place', 'Frankfurt'),
 ('spouse', 'Robert Schumann')]

In [26]:
# Embed obj_properties[-3], passing the tuple as pre-split words; [0] keeps
# the embedding tensor and discards the word ids.
prop_embeddings = get_embedding(obj_properties[-3], split_into_words=True)[0].cpu().numpy()

In [27]:
# Embed the first sentence as a plain string; [0] keeps the embedding tensor
# and discards the word ids.
sent_embeddings = get_embedding(sents[0], split_into_words=False)[0].cpu().numpy()

In [28]:
# Shape of the property embedding array.
prop_embeddings.shape

(6, 768)

In [29]:
# Shape of the sentence embedding array.
sent_embeddings.shape

(46, 768)

In [30]:
# Pairwise cosine similarity between rows of sent_embeddings and rows of
# prop_embeddings. This rebinds `out`, which an earlier cell used for the
# sentence-by-property matrix.
out = compute_cosine_similarity(sent_embeddings, prop_embeddings)

In [31]:
# Display the similarity matrix computed from the two embedding arrays.
out 

array([[0.7391835 , 0.7406858 , 0.7678678 , 0.7198651 , 0.6861969 ,
        0.6977189 ],
       [0.710143  , 0.6890756 , 0.74616295, 0.7222139 , 0.7165734 ,
        0.7058627 ],
       [0.7354579 , 0.7286489 , 0.7790096 , 0.73944587, 0.73827523,
        0.7715398 ],
       [0.6518984 , 0.6793501 , 0.7032693 , 0.7054322 , 0.6577658 ,
        0.644021  ],
       [0.7161289 , 0.6977694 , 0.7874915 , 0.7079949 , 0.7472507 ,
        0.7685714 ],
       [0.7045235 , 0.70436865, 0.72396326, 0.70726687, 0.7139511 ,
        0.73379576],
       [0.6319495 , 0.6212567 , 0.63571453, 0.66207767, 0.63166183,
        0.6550281 ],
       [0.6208272 , 0.59059846, 0.60297227, 0.6122962 , 0.6024597 ,
        0.59901655],
       [0.6875282 , 0.7239932 , 0.71276534, 0.7115817 , 0.6957548 ,
        0.69163483],
       [0.6432053 , 0.6494852 , 0.6535554 , 0.6329782 , 0.6462632 ,
        0.6549972 ],
       [0.625592  , 0.65270776, 0.6762457 , 0.602066  , 0.6783338 ,
        0.633093  ],
       [0.7049912 , 0

In [51]:
# Mean similarity of each row of out.
# NOTE(review): this cell's execution count (51) is later than the cell that
# assigns `out` above, so the saved output may belong to a different `out`.
np.mean(out, axis=1)

array([0.38488215, 0.39223468, 0.35703814, 0.37106943, 0.25315133,
       0.2246642 , 0.27041358, 0.19548784, 0.2465618 , 0.27960682,
       0.22591478, 0.20532565, 0.252273  , 0.18978477, 0.31102645,
       0.24231678, 0.31826073, 0.33132648, 0.33147523, 0.3211912 ,
       0.3361708 , 0.28258407, 0.29117566, 0.29419553, 0.2256258 ,
       0.24463859, 0.255471  , 0.27926904, 0.33433604, 0.36892757,
       0.341896  , 0.4544623 , 0.35577193, 0.35965937, 0.35946423,
       0.34404302, 0.34837943, 0.3527041 , 0.3696828 , 0.40810817],
      dtype=float32)

In [52]:
# Display the full flattened sentence list.
sents

['Clara Josephine Schumann ([ˈklaːʁa ˈʃuːman]; née Wieck; 13 September 1819 – 20 May 1896) was a German pianist, composer, and piano teacher.',
 'Regarded as one of the most distinguished pianists of the Romantic era, she exerted her influence over the course of a 61-year concert career, changing the format and repertoire of the piano recital by lessening the importance of purely virtuosic works.',
 'She also composed solo piano pieces, a piano concerto (her Op. 7), chamber music, choral pieces, and songs.',
 'She grew up in Leipzig, where both her father Friedrich Wieck and her mother Mariane were pianists and piano teachers.',
 'In addition, her mother was a singer.',
 'Clara was a child prodigy, and was trained by her father.',
 'She began touring at age eleven, and was successful in Paris and Vienna, among other cities.',
 'She married the composer Robert Schumann, and the couple had eight children.',
 'Together, they encouraged Johannes Brahms and maintained a close relationship w