In [None]:
!pip install sentence-transformers datasets

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import requests

In [None]:
import xml.etree.ElementTree as ET

# Harvest the records of one DSpace collection through the repository's OAI-PMH
# endpoint and store them, wrapped in a single <root> element, in colecao.xml.
url = "https://repositorium.sdum.uminho.pt/oai/oai"
col = "col_1822_21316"  # Collection: MSc Tese DI

records = []

for n in range(0, 1000, 100):  # Adjust the upper limit as needed
    params = {
        "verb": "ListRecords",
        "resumptionToken": f"dim///{col}/{n}"
    }
    try:
        # Without an explicit timeout a stalled connection would block the cell forever.
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort, like the parse-error branch below: report and try the next page.
        print(f"Erro no pedido HTTP no ciclo {n}: {e}")
        continue

    # Keep the raw bytes so the XML parser applies the encoding declared in the
    # document instead of whatever charset `requests` guesses for `.text`.
    payload = response.content

    # The server answers with a noRecordsMatch error once the offset is past the end.
    if b"noRecordsMatch" in payload:
        break

    try:
        page_root = ET.fromstring(payload)
        list_records = page_root.find("{http://www.openarchives.org/OAI/2.0/}ListRecords")
        if list_records is not None:
            for record in list_records.findall("{http://www.openarchives.org/OAI/2.0/}record"):
                records.append(record)
    except ET.ParseError as e:
        print(f"Erro ao processar XML no ciclo {n}: {e}")
        continue

# Build a single well-formed document holding every harvested record
root = ET.Element("root")
for rec in records:
    root.append(rec)

tree = ET.ElementTree(root)
tree.write("colecao.xml", encoding="utf-8", xml_declaration=True)

print("Coleção extraída e guardada em colecao.xml")

Coleção extraída e guardada em colecao.xml


In [None]:
import xml.etree.ElementTree as ET
import json

# Parse colecao.xml, pull the metadata fields of interest out of every record
# and write the resulting list of dicts to ColDoc.json.
data = []

tree = ET.parse('colecao.xml')
root = tree.getroot()

ns = {
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'dim': 'http://www.dspace.org/xmlns/dspace/dim'
}


def _first_text(fields, element, qualifier=None):
    """Return the stripped text of the first matching field, or "" if none matches.

    A field matches when its `element` attribute equals `element`, its text is
    non-empty and, if `qualifier` is given, its `qualifier` attribute equals it.
    """
    for field in fields:
        if field.get('element') != element or not field.text:
            continue
        if qualifier is not None and field.get('qualifier') != qualifier:
            continue
        return field.text.strip()
    return ""


# For each record in the XML
for record in root.findall('.//oai:record', ns):
    dim = record.find('.//dim:dim', ns)
    if dim is None:
        # Records without a dim metadata block carry nothing to extract.
        continue
    fields = dim.findall('dim:field', ns)
    file_info = {
        # Keywords: every non-empty "subject" field
        "keywords": [f.text.strip() for f in fields if f.get('element') == 'subject' and f.text],
        "titulo": _first_text(fields, 'title'),
        "autor": _first_text(fields, 'contributor', 'author'),
        "data": _first_text(fields, 'date'),
        "abstract": _first_text(fields, 'description', 'abstract'),
    }
    data.append(file_info)

# `data` is written as-is; nothing is appended afterwards, so the in-memory list
# and the JSON file always hold the same records.
with open('ColDoc.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [None]:
# Preview the first ten extracted records.
data[:10]

[{'keywords': ['Energy efficiency',
   'Green software',
   'Web browsers',
   'WebAssembly',
   'Eficiência energética',
   'Navegadores web',
   'Software Verde',
   'Engenharia e Tecnologia::Engenharia Eletrotécnica, Eletrónica e Informática'],
  'titulo': 'On the performance of WebAssembly',
  'autor': 'Macedo, João Gonçalves de',
  'data': '2022-04-01',
  'abstract': 'The worldwide Web has dramatically evolved in recent years. Web pages are dynamic, expressed by pro grams written in common programming languages given rise to sophisticated Web applications. Thus,\nWeb browsers are almost operating systems, having to interpret/compile such programs and execute\nthem. Although JavaScript is widely used to express dynamic Web pages, it has several shortcomings and\nperformance inefficiencies. To overcome such limitations, major IT powerhouses are developing a new\nportable and size/load efficient language: WebAssembly.\nIn this dissertation, we conduct the first systematic study on th

In [None]:
# 2. Calcular frequência de cada keyword
import json
from itertools import combinations
from collections import Counter
from sklearn.utils import resample
from datasets import Dataset

# Read the extracted records back from disk.
with open("ColDoc.json", encoding="utf-8") as f:
    docs = json.load(f)

# Flatten the keyword lists of all documents, then count each keyword.
all_keywords = []
for doc in docs:
    all_keywords.extend(doc.get("keywords", []))
keyword_freq = Counter(all_keywords)

# Number of documents in the collection.
N = len(docs)

def guess_sim(d1, d2, freq, N):
    """Heuristic similarity of two documents based on their shared keywords.

    The score is the sum of three terms:
      * the Jaccard index of the two keyword sets,
      * 1 / freq[k] for every shared keyword k (rarer keywords add more),
      * a flat 0.1 bonus if any shared keyword contains "::".

    Args:
        d1, d2: dicts; their "keywords" entry (a list, default empty) is used.
        freq: mapping keyword -> occurrence count, indexed for shared keywords.
        N: accepted but not used by the computation.

    Returns:
        0.0 if either document has no keywords, otherwise the sum above
        (which is not capped at 1).
    """
    kws_a = set(d1.get("keywords", []))
    kws_b = set(d2.get("keywords", []))
    if not (kws_a and kws_b):
        return 0.0

    shared = kws_a & kws_b
    jaccard = len(shared) / len(kws_a | kws_b)

    # Rarity term: each shared keyword contributes the inverse of its frequency.
    raridade = 0
    for kw in shared:
        raridade += 1 / freq[kw]

    # Bonus when the documents share a classification-style keyword (contains "::").
    bonus = 0.1 if any("::" in kw for kw in shared) else 0

    return jaccard + raridade + bonus

In [None]:
from  itertools import combinations

# Score every unordered pair of documents; each entry is (text1, text2, score),
# where the texts are the two abstracts.
pairs = [
    (doc_a["abstract"], doc_b["abstract"], guess_sim(doc_a, doc_b, keyword_freq, N))
    for doc_a, doc_b in combinations(docs, 2)
]

# Show the first ten pairs.
print(pairs[:10])

[('The worldwide Web has dramatically evolved in recent years. Web pages are dynamic, expressed by pro grams written in common programming languages given rise to sophisticated Web applications. Thus,\nWeb browsers are almost operating systems, having to interpret/compile such programs and execute\nthem. Although JavaScript is widely used to express dynamic Web pages, it has several shortcomings and\nperformance inefficiencies. To overcome such limitations, major IT powerhouses are developing a new\nportable and size/load efficient language: WebAssembly.\nIn this dissertation, we conduct the first systematic study on the energy and run-time performance\nof WebAssembly and JavaScript on the Web. We used micro-benchmarks and real applications to have\nmore realistic results. The results show that WebAssembly, while still in its infancy, is starting to already\noutperform JavaScript, with much more room to grow. A statistical analysis indicates that WebAssembly\nproduces significant perfo

In [None]:
# Keep only the pairs whose raw similarity score is above 0.2.
filtered_pairs = [pair for pair in pairs if pair[2] > 0.2]


In [None]:
# Show the number of generated pairs next to the number of parsed records.
len(pairs), len(data)

(499500, 1001)

In [None]:
from collections import Counter

# Tally how often each raw similarity score occurs across all pairs.
scores = Counter(pair[2] for pair in pairs)
print(scores)

Counter({0.0: 294370, 0.16043903829706357: 13189, 0.16411550888529886: 12516, 0.16828217555196553: 12097, 0.15717106444085444: 11427, 0.1542470878326673: 10880, 0.1730440803138703: 10776, 0.17853858580837578: 10720, 0.21272661999640996: 10376, 0.1925245997943898: 9043, 0.15161550888529887: 8571, 0.1849488422186322: 8356, 0.2016155088852989: 8076, 0.24447265174244173: 8028, 0.1492345565043465: 7486, 0.22661550888529888: 7112, 0.14707005433984432: 5790, 1.101615508885299: 5356, 0.1450937697548641: 4655, 0.26828217555196554: 3976, 0.14328217555196554: 3395, 0.1416155088852989: 2598, 0.30161550888529887: 2184, 0.14007704734683735: 1829, 0.13865254592233592: 1496, 0.3516155088852989: 1185, 1.1074074074074074: 1176, 0.13732979459958458: 1109, 0.13609826750598852: 1002, 0.6016155088852988: 942, 0.2185185185185185: 801, 0.1349488422186322: 753, 0.20740740740740743: 704, 0.23240740740740742: 643, 0.13387357340142791: 623, 0.19831649831649834: 588, 0.1699074074074074: 546, 0.13286550888529888: 4

In [None]:
from sklearn.utils import resample
# Balance zero-score pairs against non-zero-score pairs by undersampling.
# The names assume the zero-score pairs are the larger group.
majority_class = [pair for pair in pairs if pair[2] == 0]
minority_class = [pair for pair in pairs if pair[2] != 0]

# Sampling without replacement cannot draw more items than exist, so clamp the
# target size; when the zero-score group is the larger one this equals
# len(minority_class), exactly as before.
n_keep = min(len(majority_class), len(minority_class))

undersampled_majority_class = resample(
    majority_class,
    replace=False,        # do not duplicate samples
    n_samples=n_keep,     # match the other group's size (clamped)
    random_state=42,
)

balanced_pairs = undersampled_majority_class + minority_class

# Distribution of raw scores after balancing (displayed as the cell output).
score_counter = Counter([score for _, _, score in balanced_pairs])
score_counter

Counter({0.0: 205130,
         0.22661550888529888: 7112,
         0.14707005433984432: 5790,
         0.1849488422186322: 8356,
         0.16411550888529886: 12516,
         0.15161550888529887: 8571,
         0.17853858580837578: 10720,
         0.15717106444085444: 11427,
         0.16043903829706357: 13189,
         0.1542470878326673: 10880,
         0.1730440803138703: 10776,
         0.1925245997943898: 9043,
         0.16828217555196553: 12097,
         0.1416155088852989: 2598,
         0.21272661999640996: 10376,
         0.41272661999641: 9,
         0.2016155088852989: 8076,
         0.13387357340142791: 623,
         0.14328217555196554: 3395,
         0.12487132283878724: 60,
         0.7823847396545297: 3,
         0.1349488422186322: 753,
         0.1450937697548641: 4655,
         0.1492345565043465: 7486,
         0.8516155088852989: 1,
         0.14007704734683735: 1829,
         0.13732979459958458: 1109,
         0.13609826750598852: 1002,
         0.46828217555196

In [None]:
#Normalizar scores - esta normalização é cega, mas no tp temos de pensar como normalizar

def normalize_score(score):
    """Bucket a raw similarity score into one of four fixed labels.

    Returns 0 for a score equal to 0, 0.5 for scores below 0.5,
    0.8 for scores below 0.8, and 1 for everything else.
    """
    if score == 0:
        return 0
    # Thresholds are checked in ascending order; the first one not reached wins.
    for upper_bound, label in ((0.5, 0.5), (0.8, 0.8)):
        if score < upper_bound:
            return label
    return 1

# Replace each raw score with its bucketed label, keeping both texts unchanged.
balanced_pairs_norm = [
    (text_a, text_b, normalize_score(raw_score))
    for text_a, text_b, raw_score in balanced_pairs
]


# Spot-check one normalised entry.
print(balanced_pairs_norm[1300])

('The constant growth of high-throughput data generation and omics approaches require\ninformatics support and (semi) automated processes to be developed. With increasing number\nof sequenced genomes available, metabolic engineering processes will allow a rational alteration\nof the genetic architecture to achieve specific phenotypes. These alterations will allow\nto generate and optimize features of some organisms with economic and health interest.\nLactobacillus helveticus is an important industrial lactic-acid bacterium being used in\nthe production of several types of cheese. The metabolic activities of the bacterium contribute\nto the cheese flavour and reduce bitterness. Lb. helveticus is a growing body of literature on\nthe health-promoting properties of its various strains and generally accepted as probiotic for\nits anti-mutagenic, immunomodulatory and anti-diarrheal effects.\nThe aim of this project was to reconstruct a genome-scale metabolic network of Lb. helveticus\nCNRZ32

In [None]:
#estratificação

from sklearn.model_selection import train_test_split

# Labels used for stratification: the third element of every pair.
scores = [row[2] for row in balanced_pairs_norm]

# 80/20 split that keeps the label proportions the same on both sides.
train_data, test_data = train_test_split(
    balanced_pairs_norm,
    test_size=0.2,
    random_state=42,
    stratify=scores,
)

# Label distribution of each split, printed for comparison.
score_train = Counter(row[2] for row in train_data)
score_test = Counter(row[2] for row in test_data)

print(score_train)
print(score_test)



Counter({0: 164104, 0.5: 156903, 1: 5730, 0.8: 1471})
Counter({0: 41026, 0.5: 39226, 1: 1433, 0.8: 367})


In [None]:
from datasets import Dataset

def create_dataset(data):
  """Turn an iterable of (text1, text2, score) triples into a column dict.

  Returns a dict with the keys "text1", "text2" and "score", each mapping to a
  list that holds the corresponding element of every triple, in input order.
  """
  text1_column, text2_column, score_column = [], [], []

  for first_text, second_text, label in data:
    text1_column.append(first_text)
    text2_column.append(second_text)
    score_column.append(label)

  return {
      "text1": text1_column,
      "text2": text2_column,
      "score": score_column,
  }

# Convert both splits (train first, then test) into Dataset objects.
train_dataset, test_dataset = (
    Dataset.from_dict(create_dataset(split)) for split in (train_data, test_data)
)





In [None]:
# Display both Dataset objects.
train_dataset, test_dataset

(Dataset({
     features: ['text1', 'text2', 'score'],
     num_rows: 328208
 }),
 Dataset({
     features: ['text1', 'text2', 'score'],
     num_rows: 82052
 }))

In [None]:
from sentence_transformers import SentenceTransformer, losses

# Load the base checkpoint; this `model` object is the one passed to the trainer later.
model= SentenceTransformer('neuralmind/bert-base-portuguese-cased')
# Loss bound to `model`; the score column of the training data serves as its target.
# NOTE(review): presumably expects float labels in the cosine range — confirm against the label values used.
loss = losses.CosineSimilarityLoss(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
#Model training

from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# 2. Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="my_model",  # directory name given for the training output
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,  # NOTE(review): presumably needs a CUDA device; availability is only checked in a later cell
    eval_strategy="epoch",  # evaluation and saving use the same "epoch" schedule
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)





In [None]:
# Evaluator over the held-out pairs: embedding similarity (cosine as the main
# metric) is compared against the labels in the "score" column.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["text1"],
    sentences2=test_dataset["text2"],
    scores=test_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
)

# Assemble the trainer from the model, arguments, datasets, loss and evaluator.
# Training itself is started in a separate cell.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
import torch
# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [None]:
# Start training with the trainer configured in the previous cells.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the model on the held-out split. The sentences and scores are the
# same test_dataset columns that dev_evaluator was built from.
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["text1"],
    sentences2=test_dataset["text2"],
    scores=test_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
)
# Calling the evaluator on the model yields the metrics shown as the cell output.
test_evaluator(model)


In [None]:


# 8. Save the trained model
# NOTE(review): "directory_path" is used literally as the target directory —
# confirm it is not a leftover placeholder.
model.save_pretrained("directory_path")

In [None]:
# Reload the extracted records for the retrieval demo. An empty list here would
# leave the following cells with no abstracts to encode or to rank.
import json

with open("ColDoc.json", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Parallel lists: abstracts[i] and titles[i] come from the same record in `data`.
abstracts = [entry["abstract"] for entry in data]
titles = [entry["titulo"] for entry in data]
# Show both lengths (they are equal, since both lists are built from `data`).
len(abstracts), len(titles)

In [None]:
# Encode every abstract with the model; the result is requested as a tensor.
embeddings = model.encode(abstracts, convert_to_tensor=True)

In [None]:
# Query used for the retrieval demo: a Portuguese abstract with embedded line breaks.
# NOTE(review): some words look truncated ("e caz", "pro ssionais", "Especi camente",
# "so sticadas") — presumably dropped "fi" ligatures from text extraction; confirm
# whether the query should be cleaned before encoding.
query_text = "As organizações de saúde têm como principal objectivo a prestação de\nserviços de qualidade à população, e a tomada de decisões de forma rápida\ne e caz é essencial para que tais objectivos sejam atingidos. Deste modo,\nneste sector, a adopção de ferramentas tecnológicas automatizadas que facilitam\neste processo tem vindo a aumentar ao longo dos anos. Neste contexto,\nsurge o conceito de Business Intelligence (BI) que auxilia a tomada de decisão\npor parte dos pro ssionais de saúde, uma vez que estes sistemas se baseiam\nna Extracção de Conhecimento (EC) gerado pelos sistemas de informação\ntransaccionais, sendo capazes de integrar uma enorme quantidade de dados\nprovenientes de diversas fontes, normalmente de bases de dados que se encontram\nem diferentes tecnologias, plataformas e totalmente desintegradas.\nAssim, ultrapassando-se a heterogeneidade das bases de dados, através da\nestruturação dos dados, extrai-se informação que permitirá atingir conhecimento\nimportante para as decisões clínicas.\nEspeci camente, a Unidade de Cuidados Intensivos (UCI) de um hospital\né a unidade mais cara e que mais recursos exige, de tal forma que os sistemas\nde BI podem desempenhar um papel preponderante não só na racionalização\ndos custos, mas também na melhoria da qualidade dos cuidados prestados,\natravés da monitorização dos dados clínicos dos pacientes. Deste modo, este\nprojecto pioneiro incidiu na análise da aplicação do Pentaho, um software\nOpen-Source (OS) de BI, nos processos de EC a estas unidades hospitalares,\ntendo como fonte os dados dos pacientes de um hospital localizado no Norte\nde Portugal, avaliando o conhecimento obtido e o seu impacto na tomada de\ndecisão.\nEste software disponibiliza ferramentas que analisam, sintetizam, assimilam\ne dão sentido às enormes quantidades de informação, sendo capaz\nde estabelecer ligações so sticadas e discernir padrões, dando oportunidade\npara tirar conclusões e agir de forma preventiva."


In [None]:
from sentence_transformers import util
import torch

# Encode the query the same way as the corpus abstracts.
query_embedding = model.encode(query_text, convert_to_tensor=True)

# Calculate the similarity between the query and the abstracts
cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)

# Ask for at most 15 hits, but never more than there are abstracts to rank;
# topk rejects a k larger than the dimension it selects from.
top_k = min(15, cosine_scores.shape[-1])
retrieval_results = torch.topk(cosine_scores, k=top_k)

# Row 0 holds the results for the single query; print the hits in ranked order.
for score, idx in zip(retrieval_results.values[0], retrieval_results.indices[0]):

    print(f"Title: {titles[idx]}\nSimilarity Score: {score.item():.4f}\nSentence: {abstracts[idx]} \n")
    print("-" * 80)