In [2]:
pip install torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (27 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl (175.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.8/175.8 MB[0m [31m126.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torch
Successfully installed torch-2.7.0+cpu
Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.31.1-py3-none-any.whl (484 kB)
Downloadin

In [4]:
import json
import os
import torch
import pickle
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from typing import List, Tuple

In [5]:
# ----------------------------
# CONFIGURATION
# ----------------------------
# NOTE(review): run_sdg_classification does not read these constants; it
# carries its own copy of the model id and of the 512 limit, so a change made
# here has no effect on that function unless the values are passed/kept in sync.

MODEL_NAME = "facebook/bart-large-mnli"  # Hugging Face model id for the zero-shot classification pipeline
USE_GPU = torch.cuda.is_available()  # True when torch reports an available CUDA device
TOP_K = 3  # same value as the default top_k of classify_with_llm
MAX_TOKENS = 512  # NOTE(review): the 512 cut-off applied in this notebook counts whitespace-separated words, not model tokens

In [6]:
# ----------------------------
# SDG CANDIDATE LABELS (flattened)
# ----------------------------
# Flat list of 17 goal titles, suitable as the `labels` argument of
# classify_with_llm.
# NOTE(review): run_sdg_classification loads its labels from the pickle at
# goal_path and never reads this module-level list.

sdg_labels = [
    "No Poverty",
    "Zero Hunger",
    "Good Health and Well-being",
    "Quality Education",
    "Gender Equality",
    "Clean Water and Sanitation",
    "Affordable and Clean Energy",
    "Decent Work and Economic Growth",
    "Industry, Innovation and Infrastructure",
    "Reduced Inequalities",
    "Sustainable Cities and Communities",
    "Responsible Consumption and Production",
    "Climate Action",
    "Life Below Water",
    "Life on Land",
    "Peace, Justice and Strong Institutions",
    "Partnerships for the Goals"
]


In [7]:
# ----------------------------
# LOAD AND PARSE INPUT FILE
# ----------------------------

def load_dat_file(path: str) -> List[dict]:
    """Read a UTF-8 file of ``ep<TAB>text`` records into a list of dicts.

    Each line is stripped of surrounding whitespace and cut at its first tab.
    Lines that contain no tab after stripping (blank lines, lines whose text
    part is empty, lines without a separator) are skipped silently.

    Args:
        path: Location of the tab-separated input file.

    Returns:
        One ``{'ep': ..., 'text': ...}`` dict per accepted line, in file
        order, with both values stripped of surrounding whitespace.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            ep, tab, text = raw_line.strip().partition('\t')
            if not tab:
                # No separator left once the line is stripped: not a record.
                continue
            records.append({'ep': ep.strip(), 'text': text.strip()})
    return records

In [8]:
# ----------------------------
# CLASSIFIER FUNCTION
# ----------------------------

def classify_with_llm(text: str, labels: List[str], classifier, top_k: int = 3) -> List[Tuple[str, float]]:
    """Score ``text`` against ``labels`` and keep the best-scoring pairs.

    Args:
        text: Document to classify.
        labels: Candidate label strings handed to the classifier.
        classifier: Callable invoked as
            ``classifier(text, candidate_labels=labels, multi_label=True)``
            that returns a mapping with parallel ``'labels'`` and ``'scores'``
            sequences.
        top_k: Maximum number of (label, score) pairs to return.

    Returns:
        Up to ``top_k`` ``(label, score)`` tuples ordered by descending
        score; pairs with equal scores keep the order the classifier
        returned them in.
    """
    output = classifier(text, candidate_labels=labels, multi_label=True)
    ranked = sorted(
        zip(output['labels'], output['scores']),
        key=lambda pair: pair[1],
        reverse=True,  # stable: ties stay in classifier order
    )
    return ranked[:top_k]


In [9]:
# ----------------------------
# MAIN FUNCTION
# ----------------------------

def run_sdg_classification(input_path, goal_path, output_path, classifier=None,
                           model_name="facebook/bart-large-mnli", max_words=512):
    """Classify every document of a tab-separated file against pickled SDG labels.

    The input file holds one ``ep<TAB>text`` record per line. Lines without a
    tab, or whose identifier or text is empty once surrounding whitespace is
    removed, are skipped (the same lines load_dat_file ignores) instead of
    aborting the whole run with a ValueError.

    Args:
        input_path: UTF-8 text file with ``ep<TAB>text`` lines.
        goal_path: Pickle file holding an object that supports
            ``.iloc[:, 0].tolist()`` (e.g. a pandas DataFrame); its first
            column supplies the candidate labels.
        output_path: Destination JSON file; it is overwritten.
        classifier: Optional callable used as
            ``classifier(text, candidate_labels=..., multi_label=True)`` that
            returns a mapping with ``'labels'`` and ``'scores'``. When omitted,
            a transformers zero-shot-classification pipeline is created.
        model_name: Model id for the pipeline created when ``classifier`` is
            omitted.
        max_words: Number of leading whitespace-separated words of each text
            that are sent to the classifier. This is a word count, not a model
            token count.

    Side effects:
        Writes a JSON list of ``{"ep", "text", "predictions"}`` objects to
        ``output_path`` and prints a confirmation line. ``"text"`` holds the
        full (untruncated) text; ``"predictions"`` holds every
        ``[label, score]`` pair in the order the classifier returned them.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only point goal_path at files from a trusted source.
    with open(goal_path, 'rb') as f:
        sdg_df = pickle.load(f)
    goal_labels = sdg_df.iloc[:, 0].tolist()  # assumes SDG names are in first column

    # Load patent texts, tolerating malformed lines.
    documents = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            ep, tab, text = line.partition("\t")
            ep, text = ep.strip(), text.strip()
            if not tab or not ep or not text:
                # Blank line, missing separator, or an empty field: there is
                # nothing meaningful to classify.
                continue
            documents.append({"ep": ep, "text": text})

    # Build the default model only when the caller did not inject one.
    if classifier is None:
        classifier = pipeline(
            "zero-shot-classification",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1,
        )

    # Classify each text
    results = []
    for doc in documents:
        truncated = " ".join(doc["text"].split()[:max_words])  # truncate long text
        pred = classifier(truncated, candidate_labels=goal_labels, multi_label=True)
        results.append({
            "ep": doc["ep"],
            "text": doc["text"],
            "predictions": list(zip(pred["labels"], pred["scores"]))
        })

    # Save results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f" Classification done. Results saved to {output_path}")

### EXAMPLE RUN

In [16]:
import gzip
def load_list(filename):
    """
    Loads a gzipped JSON Lines (jsonl) file and returns a list of the decoded records.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising json.JSONDecodeError.

    Parameters:
        filename (str): The filename of the gzipped jsonl file.

    Returns:
        list: One decoded JSON value per non-blank line, in file order
        (dictionaries when every line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Nothing to decode on an empty line.
                continue
            records.append(json.loads(line))
    return records

In [20]:
# Build a one-record sample input: only the first document of test.dat.gz is
# written to test1.dat, as a single "id<TAB>text" line.
texts = load_list("test.dat.gz")
with open("test1.dat", "w", encoding="utf-8") as f:
    for text in texts[:1]:
        f.write("\t".join((text["id"], text["text"])) + "\n")

In [23]:
# Classify the records of test1.dat against the goal labels pickled in
# "sgd_goals.dat"; the predictions are written to results.json.
run_sdg_classification("test1.dat", "sgd_goals.dat", "results.json")

Device set to use cpu


 Classification done. Results saved to results.json


In [24]:
import json

# Reload the saved predictions and print, per patent, every goal with its
# score rounded to two decimals.
with open("results.json", "r", encoding="utf-8") as f:
    results = json.loads(f.read())

for entry in results:
    report = [f"\nPatent: {entry['ep']}"]
    report.extend(f" - {goal}: {score:.2f}" for goal, score in entry['predictions'])
    print("\n".join(report))


Patent: EP3268034A220180117
 - Reduced Inequalities: 0.16
 - Partnerships for the Goals: 0.07
 - Good Health and Well-being: 0.06
 - Responsible Consumption and Production: 0.03
 - Life on Land: 0.02
 - No Poverty: 0.01
 - Life Below Water: 0.01
 - Decent Work and Economic Growth: 0.01
 - Zero Hunger: 0.00
 - Industry, Innovation and Infrastructure: 0.00
 - Sustainable Cities and Communities: 0.00
 - Quality Education: 0.00
 - Climate Action: 0.00
 - Gender Equality: 0.00
 - Peace, Justice and Strong Institutions: 0.00
 - Affordable and Clean Energy: 0.00
 - Clean Water and Sanitation: 0.00
