# Imports

## Libraries

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
server_ip = os.getenv('SERVER_IP')
port = os.getenv('PORT')

BASE_URL = f"http://{server_ip}:{port}/v1"

In [2]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device detected")

Torch version: 2.2.0+cu121
CUDA available: True
CUDA device name: NVIDIA GeForce RTX 4070 SUPER


In [3]:
from torchtext.datasets import UDPOS
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
import openai
import json
import re

## Data

In [4]:
# Load the train, validation, and test datasets
train_iter, valid_iter, test_iter = UDPOS(root='.data', split=('train', 'valid', 'test'))

In [None]:
# Look at the first few items in the training set
for i, item in enumerate(test_iter):
    print(item)
    if i == 0:  # Print first 5 items
        break

[['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?'], ['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT'], ['WP', 'IN', 'NNP', 'VBD', 'IN', 'NNP', '.']]


## Models

### bert-pos

In [6]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Failed to load CUDA kernels. Mra requires custom CUDA kernels. Please verify that compatible versions of PyTorch and CUDA Toolkit are installed: CUDA_HOME environment variable is not set. Please s

In [7]:
outputs = pipeline("A test example")
print(outputs)

[{'entity': 'DT', 'score': 0.9997243, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, {'entity': 'NN', 'score': 0.9997472, 'index': 2, 'word': 'test', 'start': 2, 'end': 6}, {'entity': 'NN', 'score': 0.99973196, 'index': 3, 'word': 'example', 'start': 7, 'end': 14}]


### mixtral 8x7b

In [8]:
# Set the base URL to point to your running LM Studio server
openai.api_base = BASE_URL
openai.api_key = "dummy-key"

In [None]:
response = openai.Completion.create(
    model="mixtral-8x7b-instruct-v0.1",
    prompt="Answer this: <Hello, good morning!>",
    max_tokens=20
)
response["choices"][0]["text"].strip()

'Good morning!\n\nI am your virtual assistant for today. How can I assist'

# Mapping

In [10]:
# Extract sentences and ground truth POS tags from test_iter
sentences = [item[0] for item in test_iter]  # Extract tokenized sentences
ground_truth_tags = [item[1] for item in test_iter]  # Extract ground truth POS tags

example_sentence = sentences[0]
example_true_tag = ground_truth_tags[0]



In [None]:
example_true_tag

In [11]:
# Create a dictionary to map Penn Treebank tags to UDPOS tags
penn_to_udpos = {
    'O': 'X',
    '``': 'PUNCT',
    ',': 'PUNCT',
    ':': 'PUNCT',
    '.': 'PUNCT',
    "''": 'PUNCT',
    '$': 'SYM',
    '#': 'SYM',
    'CC': 'CCONJ',
    'CD': 'NUM',
    'DT': 'DET',
    'EX': 'PRON',
    'FW': 'X',
    'IN': 'ADP',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    '-LRB-': 'PUNCT',
    'LS': 'X',
    'MD': 'AUX',
    'NN': 'NOUN',
    'NNP': 'PROPN',
    'NNPS': 'PROPN',
    'NNS': 'NOUN',
    'PDT': 'DET',
    'POS': 'PART',
    'PRP': 'PRON',
    'PRP$': 'PRON',
    'RB': 'ADV',
    'RBR': 'ADV',
    'RBS': 'ADV',
    'RP': 'PART',
    '-RRB-': 'PUNCT',
    'SYM': 'SYM',
    'TO': 'PART',
    'UH': 'INTJ',
    'VB': 'VERB',
    'VBD': 'VERB',
    'VBG': 'VERB',
    'VBN': 'VERB',
    'VBP': 'VERB',
    'VBZ': 'VERB',
    'WDT': 'DET',
    'WP': 'PRON',
    'WP$': 'PRON',
    'WRB' : 'ADV'
}

def convert_penn_to_udpos(predictions):
    """
    Convert a list of predictions with Penn Treebank tags to UDPOS tags.

    :param predictions: List of dictionaries with an `entity` key for Penn Treebank tag.
                        Example: [{'entity':'DT', ...}, ...]
    
    :return: List of dictionaries with `entity` key converted to UDPOS tag.
             Example: [{'entity':'DET', ...}, ...]
    """
    # Ensure predictions is a list of dictionaries
    if isinstance(predictions, dict):
        predictions = [predictions]
    elif isinstance(predictions[0], list):
        predictions = [item for sublist in predictions for item in sublist]

    for prediction in predictions:
        penn_tag = prediction.get('entity')  # Safely get 'entity' key
        if penn_tag is not None:
            # Map Penn Treebank tag to UDPOS tag, defaulting to `X`
            prediction['entity'] = penn_to_udpos.get(penn_tag, "X")
    
    return predictions


In [12]:
outputs = pipeline(example_sentence)
udpos_outputs = convert_penn_to_udpos(outputs)
bert_prediction = [item['entity'] for item in udpos_outputs]
print(bert_prediction)

['PRON',
 'ADP',
 'PROPN',
 'VERB',
 'VERB',
 'VERB',
 'ADP',
 'PROPN',
 'PROPN',
 'PUNCT']

In [13]:
def pos_tag_sentence(sentence, track_responses=None):
    """
    Perform POS tagging on a sentence and keep track of raw and parsed responses.

    Parameters:
      - sentence (str): The input sentence to perform POS tagging on.
      - track_responses (list, optional): A list to store raw and parsed responses for tracking.

    Returns:
      - dict: Parsed JSON response or an error message if parsing fails.
    """
    # Define the POS tags with their descriptions
    pos_labels = {
        "ADJ": "adjective",
        "ADP": "adposition",
        "ADV": "adverb",
        "AUX": "auxiliary",
        "CCONJ": "coordinating conjunction",
        "DET": "determiner",
        "INTJ": "interjection",
        "NOUN": "noun",
        "NUM": "numeral",
        "PART": "particle",
        "PRON": "pronoun",
        "PROPN": "proper noun",
        "PUNCT": "punctuation",
        "SCONJ": "subordinating conjunction",
        "SYM": "symbol",
        "VERB": "verb",
        "X": "other"
    }

    # Construct the prompt with instructions for POS tagging
    prompt = (
        f"Perform POS tagging on the following sentence using these labels: {pos_labels}. "
        f"Output each word with its tag in JSON format.\n\nSentence: \"{sentence}\""
    )

    # Send request to the model
    response = openai.Completion.create(
        model="mixtral-8x7b-instruct-v0.1",
        prompt=prompt,
        max_tokens=-1
    )

    # Extract text from response
    raw_output = response["choices"][0]["text"].strip()

    # Preprocess and clean the raw output
    cleaned_output = preprocess_raw_output(raw_output)

    # Attempt to parse the cleaned output as JSON
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError:
        parsed_json = {"error": "Response could not be parsed as JSON", "raw_output": cleaned_output}

    # Track responses if a tracking list is provided
    if track_responses is not None:
        track_responses.append({"raw_response": raw_output, "parsed_json": parsed_json})

    return parsed_json


def preprocess_raw_output(raw_output):
    """
    Preprocess and clean the raw output to make it valid JSON.
    
    - Fix common issues like invalid JSON syntax.
    
    Parameters:
      raw_output (str): The raw output from the model.
      
    Returns:
      str: The cleaned output.
    """
    # Replace invalid JSON entries (e.g., `{".", "."}` -> `{".": "."}`)
    cleaned_output = raw_output.replace('{".", "."}', '{".": "."}')
    
    # Additional cleaning logic can go here if needed
    return cleaned_output


# Example usage
sentence = example_sentence
responses_log = []

pos_tags_json = pos_tag_sentence(sentence, track_responses=responses_log)
mixtral_prediction = [item['tag'] for item in pos_tags_json]

mixtral_prediction

['INTJ', 'CCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']

# Evaluation

In [17]:
print(example_sentence)

['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?']

In [14]:
print(example_true_tag)

['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']

In [18]:
print(bert_prediction)

['PRON', 'ADP', 'PROPN', 'VERB', 'VERB', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PUNCT']


In [16]:
print(mixtral_prediction)

['INTJ', 'CCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']