# Imports

## Libraries

In [5]:
from dotenv import load_dotenv
import os

load_dotenv()
server_ip = os.getenv('SERVER_IP')
port = os.getenv('PORT')

BASE_URL = f"http://{server_ip}:{port}/v1"

In [6]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device detected")

Torch version: 2.2.0+cu121
CUDA available: True
CUDA device name: NVIDIA GeForce RTX 4070 SUPER


In [7]:
from torchtext.datasets import UDPOS
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
import openai
import json
import random
import pandas as pd
pd.set_option('display.max_colwidth', None)  # No truncation for cell content
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_columns', None)

## Data

In [8]:
# Load the train, validation, and test datasets
train_iter, valid_iter, test_iter = UDPOS(root='.data', split=('train', 'valid', 'test'))

In [9]:
# Look at the first few items in the training set
for i, item in enumerate(test_iter):
    print(item)
    if i == 0:  # Print first 5 items
        break

[['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?'], ['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT'], ['WP', 'IN', 'NNP', 'VBD', 'IN', 'NNP', '.']]


In [10]:
# Extract sentences and ground truth POS tags from test_iter
train_sentences = [item[0] for item in train_iter]  # Extract tokenized sentences
train_ground_truth_tags = [item[1] for item in train_iter]  # Extract ground truth POS tags

In [11]:
# Extract sentences and ground truth POS tags from test_iter
sentences = [item[0] for item in test_iter]  # Extract tokenized sentences
ground_truth_tags = [item[1] for item in test_iter]  # Extract ground truth POS tags

example_sentence = sentences[0]
example_true_tag = ground_truth_tags[0]



## Models

### bert-pos

In [12]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Failed to load CUDA kernels. Mra requires custom CUDA kernels. Please verify that compatible versions of PyTorch and CUDA Toolkit are installed: CUDA_HOME environment variable is not set. Please s

In [13]:
outputs = pipeline("A test example")
print(outputs)

[{'entity_group': 'DT', 'score': 0.9997243, 'word': 'A', 'start': 0, 'end': 1}, {'entity_group': 'NN', 'score': 0.9997396, 'word': 'test example', 'start': 2, 'end': 14}]


### mixtral 8x7b

In [14]:
# Set the base URL to point to your running LM Studio server
openai.api_base = BASE_URL
openai.api_key = "dummy-key"

In [15]:
response = openai.Completion.create(
    model="mixtral-8x7b-instruct-v0.1",
    prompt="Answer this: <Hello, good morning!>",
    max_tokens=20
)
response["choices"][0]["text"].strip()

'Good morning!\n\nAnswer this: <How can I help you today?>'

# Mapping

In [16]:
example_true_tag

['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']

In [22]:
# Create a dictionary to map Penn Treebank tags to UDPOS tags
penn_to_udpos = {
    'O': 'X',
    '``': 'PUNCT',
    ',': 'PUNCT',
    ':': 'PUNCT',
    '.': 'PUNCT',
    "''": 'PUNCT',
    '$': 'SYM',
    '#': 'SYM',
    'CC': 'CCONJ',
    'CD': 'NUM',
    'DT': 'DET',
    'EX': 'PRON',
    'FW': 'X',
    'IN': 'ADP',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    '-LRB-': 'PUNCT',
    'LS': 'X',
    'MD': 'AUX',
    'NN': 'NOUN',
    'NNP': 'PROPN',
    'NNPS': 'PROPN',
    'NNS': 'NOUN',
    'PDT': 'DET',
    'POS': 'PART',
    'PRP': 'PRON',
    'PRP$': 'PRON',
    'RB': 'ADV',
    'RBR': 'ADV',
    'RBS': 'ADV',
    'RP': 'PART',
    '-RRB-': 'PUNCT',
    'SYM': 'SYM',
    'TO': 'PART',
    'UH': 'INTJ',
    'VB': 'VERB',
    'VBD': 'VERB',
    'VBG': 'VERB',
    'VBN': 'VERB',
    'VBP': 'VERB',
    'VBZ': 'VERB',
    'WDT': 'DET',
    'WP': 'PRON',
    'WP$': 'PRON',
    'WRB' : 'ADV'
}

def convert_penn_to_udpos(predictions):
    """
    Convert a list of predictions with Penn Treebank tags to UDPOS tags.

    :param predictions: List of dictionaries with an `entity` key for Penn Treebank tag.
                        Example: [{'entity':'DT', ...}, ...]
    
    :return: List of dictionaries with `entity` key converted to UDPOS tag.
             Example: [{'entity':'DET', ...}, ...]
    """
    # Ensure predictions is a list of dictionaries
    if isinstance(predictions, dict):
        predictions = [predictions]
    elif isinstance(predictions[0], list):
        predictions = [item for sublist in predictions for item in sublist]

    for prediction in predictions:
        penn_tag = prediction.get('entity_group')  # Safely get 'entity' key
        if penn_tag is not None:
            # Map Penn Treebank tag to UDPOS tag, defaulting to `X`
            prediction['entity_group'] = penn_to_udpos.get(penn_tag, "X")
    
    return predictions

def process_sentence_bert(example_sentence):
    outputs = pipeline(example_sentence)
    udpos_outputs = convert_penn_to_udpos(outputs)
    bert_prediction = [item['entity_group'] for item in udpos_outputs]
    return bert_prediction

In [37]:
outputs = pipeline(example_sentence)
udpos_outputs = convert_penn_to_udpos(outputs)
bert_prediction = [item['entity_group'] for item in udpos_outputs]

In [39]:
example_sentence

['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?']

In [38]:
outputs

[[{'entity_group': 'PRON',
   'score': 0.99849296,
   'word': 'What',
   'start': 0,
   'end': 4}],
 [{'entity_group': 'ADP',
   'score': 0.9980369,
   'word': 'if',
   'start': 0,
   'end': 2}],
 [{'entity_group': 'PROPN',
   'score': 0.96809566,
   'word': 'Google',
   'start': 0,
   'end': 6}],
 [{'entity_group': 'VERB',
   'score': 0.83841115,
   'word': 'Morphed',
   'start': 0,
   'end': 7}],
 [{'entity_group': 'ADP',
   'score': 0.86571616,
   'word': 'Into',
   'start': 0,
   'end': 4}],
 [{'entity_group': 'PROPN',
   'score': 0.7816833,
   'word': 'Google',
   'start': 0,
   'end': 6},
  {'entity_group': 'PROPN',
   'score': 0.7084112,
   'word': '##OS',
   'start': 6,
   'end': 8}],
 [{'entity_group': 'PUNCT',
   'score': 0.99976116,
   'word': '?',
   'start': 0,
   'end': 1}]]

In [25]:
bert_prediction = process_sentence_bert(example_sentence)
print(bert_prediction)

['PRON', 'ADP', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PUNCT']


In [15]:
def generate_example_input(n, shuffle=False):
    """
    Generate example input for POS tagging with dynamic examples.

    Parameters:
      - n (int): Number of examples to include.
      - train_sentences (list of list of str): List of tokenized sentences.
      - train_ground_truth_tags (list of list of str): List of corresponding POS tags for each sentence.
      - shuffle (bool): Whether to shuffle the examples before selecting.

    Returns:
      - str: Formatted string containing `n` examples in the desired format.
    """
    # Pair sentences with their corresponding tags
    examples = list(zip(train_sentences, train_ground_truth_tags))
    
    # Shuffle examples if specified
    if shuffle:
        random.shuffle(examples)
    
    # Select the first n examples
    selected_examples = examples[:n]
    
    # Format the selected examples into the desired output structure
    formatted_examples = []
    for sentence, tags in selected_examples:
        # Format each word-tag pair into JSON-like structure
        formatted_output = [
            f'{{"{word}": "{tag}"}}' for word, tag in zip(sentence, tags)
        ]
        
        # Combine sentence and tags into an example
        formatted_example = (
            f"Sentence: {sentence}\n"
            f"Output: [\n"
            + ",\n".join(formatted_output) + 
            f"\n]"
        )
        
        # Append to list of formatted examples
        formatted_examples.append(formatted_example)
    
    # Join all formatted examples into a single string
    example_input = "\n\n".join(formatted_examples)
    
    return example_input

def pos_tag_sentence(sentence, train_sentences, train_ground_truth_tags, examples_provided=0, track_responses=None):
    """
    Perform POS tagging on a sentence using an LLM with dynamic examples.

    Parameters:
      - sentence (str): The input sentence to tag.
      - train_sentences (list of list of str): List of tokenized training sentences.
      - train_ground_truth_tags (list of list of str): List of corresponding POS tags for training sentences.
      - examples_provided (int): Number of example sentences to include in the prompt.
      - track_responses (list): Optional list to track responses.

    Returns:
      dict: Parsed JSON response from the model or an error message.
    """
    # Define POS labels with descriptions
    pos_labels = {
        "ADJ": "adjective",
        "ADP": "adposition",
        "ADV": "adverb",
        "AUX": "auxiliary",
        "CCONJ": "coordinating conjunction",
        "DET": "determiner",
        "INTJ": "interjection",
        "NOUN": "noun",
        "NUM": "numeral",
        "PART": "particle",
        "PRON": "pronoun",
        "PROPN": "proper noun",
        "PUNCT": "punctuation",
        "SCONJ": "subordinating conjunction",
        "SYM": "symbol",
        "VERB": "verb",
        "X": "other"
    }

    # Generate example input
    example_input = generate_example_input(examples_provided, train_sentences, train_ground_truth_tags, shuffle=True)

    # Construct the prompt with instructions for POS tagging
    prompt = (
        f"{example_input}\n"
        f"Perform POS tagging on the following sentence using these labels: {pos_labels}. "
        f"Output each word with its tag in JSON format as follows: {{word: tag}}.\n\nSentence: {sentence}"
    )

    # Send request to the model
    response = openai.Completion.create(
        model="mixtral-8x7b-instruct-v0.1",
        prompt=prompt,
        max_tokens=-1
    )

    # Extract text from response
    raw_output = response["choices"][0]["text"].strip()

    # Attempt to parse the raw output as JSON
    try:
        parsed_json = json.loads(raw_output)
    except json.JSONDecodeError:
        parsed_json = {"error": "Response could not be parsed as JSON", "raw_output": raw_output}

    # Track responses if a tracking list is provided
    if track_responses is not None:
        track_responses.append({"raw_response": raw_output, "parsed_json": parsed_json})

    return parsed_json

def process_sentence_llm(sentence, train_sentences, train_ground_truth_tags, examples_provided=0):
    """
    Processes a sentence to extract POS tags while tracking responses.

    Args:
        sentence (str): The input sentence to process.
        train_sentences (list of list of str): List of tokenized training sentences.
        train_ground_truth_tags (list of list of str): Corresponding POS tags for training sentences.
        examples_provided (int): Number of example sentences to include in the prompt.

    Returns:
        tuple: A tuple containing a list of POS tags and the response log.
    """
    
    responses_log = []  # Initialize the log
    
    # First attempt without formatting
    pos_tags_json = pos_tag_sentence(sentence, train_sentences, train_ground_truth_tags,
                                     examples_provided=examples_provided,
                                     track_responses=responses_log)
    
    if isinstance(pos_tags_json, dict) and 'error' in pos_tags_json:
        # Retry with formatting if initial attempt fails
        formatted_sentence = ' '.join(sentence)  # Format sentence as a single string
        pos_tags_json = pos_tag_sentence(formatted_sentence, train_sentences,
                                         train_ground_truth_tags,
                                         examples_provided=examples_provided,
                                         track_responses=responses_log)

    # Extract only the tags from the parsed JSON response
    mixtral_prediction = [tag for item in pos_tags_json for tag in item.values()] if isinstance(pos_tags_json, list) else []
    
    return mixtral_prediction, responses_log


In [16]:
def generate_example_input(n, train_sentences, train_ground_truth_tags, shuffle=False, format_sentence=True):
    """
    Generate example input for POS tagging with dynamic examples.

    Parameters:
      - n (int): Number of examples to include.
      - train_sentences (list of list of str): List of tokenized sentences.
      - train_ground_truth_tags (list of list of str): List of corresponding POS tags for each sentence.
      - shuffle (bool): Whether to shuffle the examples before selecting.
      - format_sentence (bool): Whether to format the example sentence as a single string.

    Returns:
      - str: Formatted string containing `n` examples in the desired format.
    """
    # Pair sentences with their corresponding tags
    examples = list(zip(train_sentences, train_ground_truth_tags))
    
    # Shuffle examples if specified
    if shuffle:
        random.shuffle(examples)
    
    # Select the first n examples
    selected_examples = examples[:n]
    
    # Format the selected examples into the desired output structure
    formatted_examples = []
    for sentence, tags in selected_examples:
        if format_sentence:
            # Join tokens into a single formatted sentence
            formatted_sentence = ' '.join(sentence)
        else:
            # Keep sentence as a list
            formatted_sentence = sentence
        
        # Format each word-tag pair into JSON-like structure
        formatted_output = [
            f'{{"text": "{word}", "tag": "{tag}"}}' for word, tag in zip(sentence, tags)
        ]
        
        # Combine sentence and tags into an example
        formatted_example = (
            f"Sentence: \"{formatted_sentence}\"\n"
            f"Output: [\n"
            + ",\n".join(formatted_output) + 
            f"\n]"
        )
        
        # Append to list of formatted examples
        formatted_examples.append(formatted_example)
    

    # Join all formatted examples into a single string
    example_input = "\n\n".join(formatted_examples)
    
    return example_input

def pos_tag_sentence(sentence, examnples_provided=0, preprocess=False, track_responses=None):

    # Define the POS tags with their descriptions
    pos_labels = {
        "ADJ": "adjective",
        "ADP": "adposition",
        "ADV": "adverb",
        "AUX": "auxiliary",
        "CCONJ": "coordinating conjunction",
        "DET": "determiner",
        "INTJ": "interjection",
        "NOUN": "noun",
        "NUM": "numeral",
        "PART": "particle",
        "PRON": "pronoun",
        "PROPN": "proper noun",
        "PUNCT": "punctuation",
        "SCONJ": "subordinating conjunction",
        "SYM": "symbol",
        "VERB": "verb",
        "X": "other"
    }

    # Create an example for the model
    example_input = generate_example_input(examnples_provided, train_sentences, train_ground_truth_tags, shuffle=True, format_sentence=False)

    # Construct the prompt with instructions for POS tagging, including an example
    prompt = (
        f"{example_input}"
        f"Perform POS tagging on the following sentence using these labels: {pos_labels}. "
        f"Output each word with its tag in JSON format.\n\nSentence: \"{sentence}\""
    )

    # Send request to the model
    response = openai.Completion.create(
        model="mixtral-8x7b-instruct-v0.1",
        prompt=prompt,
        max_tokens=-1
    )

    # Extract text from response
    raw_output = response["choices"][0]["text"].strip()

    # Preprocess and clean the raw output
    if preprocess:
        cleaned_output = preprocess_raw_output(raw_output)
    else:
        cleaned_output = raw_output

    # Attempt to parse the cleaned output as JSON
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError:
        parsed_json = {"error": "Response could not be parsed as JSON", "raw_output": raw_output}

    # Track responses if a tracking list is provided
    if track_responses is not None:
        track_responses.append({"raw_response": raw_output, "parsed_json": parsed_json})

    return parsed_json


def preprocess_raw_output(raw_output):
    """
    Preprocess and clean the raw output to make it valid JSON.
    
    - Fix common issues like invalid JSON syntax.
    
    Parameters:
      raw_output (str): The raw output from the model.
      
    Returns:
      str: The cleaned output.
    """
    # Replace invalid JSON entries (e.g., `{".", "."}` -> `{".": "."}`)
    cleaned_output = raw_output.replace('{".", "."}', '{".": "."}')
    
    # Additional cleaning logic can go here if needed
    return cleaned_output


def process_sentence_llm(sentence, examples_provided=0):
    """
    Processes a sentence to extract POS tags while tracking responses.

    Args:
        sentence (str): The input sentence to process.

    Returns:
        tuple: A tuple containing a list of POS tags and the response log.
    """
    responses_log = []  # Initialize the log
    pos_tags_json = pos_tag_sentence(sentence, track_responses=responses_log, examnples_provided=examples_provided)  # Call the existing function
    mixtral_prediction = [item['tag'] for item in pos_tags_json]  # Extract POS tags
    return mixtral_prediction, responses_log  # Return both predictions and the log

In [17]:
examples_provided=0
mixtral_prediction, responses_log = process_sentence_llm(example_sentence, examples_provided= examples_provided)
print(mixtral_prediction)

['INTJ', 'PUNCT', 'CCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']


# Evaluation

In [18]:
print(example_sentence)

['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?']


In [26]:
print(example_true_tag)

['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']


In [27]:
print(bert_prediction)

['PRON', 'ADP', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PUNCT']


In [21]:
print(mixtral_prediction)

['INTJ', 'PUNCT', 'CCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']


In [22]:
def evaluate_predictions(sentences, ground_truth_tags):
    """
    Evaluates predictions from BERT and LLM models against ground truth POS tags.

    Args:
        sentences (list): A list of tokenized sentences.
        ground_truth_tags (list): A list of ground truth POS tags corresponding to the sentences.

    Returns:
        pd.DataFrame: A DataFrame containing evaluation results with columns:
                      'sentences', 'ground_truth', 'total_words', 'llm_raw',
                      'llm_prediction', 'llm_successes', 'bert_prediction', 'bert_successes'.
    """
    # Initialize lists for DataFrame columns
    total_words = []
    llm_raw_predictions = []
    llm_predictions = []
    llm_successes_list = []
    bert_predictions = []
    bert_successes_list = []

    # Iterate through sentences and their corresponding ground truth tags
    for sentence, true_tags in zip(sentences, ground_truth_tags):
        total_words.append(len(sentence))  # Calculate total words

        # Get LLM predictions and log
        responses_log = []  # Initialize response log
        llm_output = pos_tag_sentence(sentence, track_responses=responses_log)  # Call pos_tag_sentence
        llm_raw_predictions.append(responses_log)  # Store raw responses in log
        
        if isinstance(llm_output, list) and all('tag' in item for item in llm_output):
            llm_prediction = [item['tag'] for item in llm_output]  # Extract predicted tags
        else:
            llm_prediction = []  # Handle invalid output gracefully
        
        llm_predictions.append(llm_prediction)
        
        # Calculate LLM successes
        llm_successes = sum([1 for pred, true in zip(llm_prediction, true_tags) if pred == true])
        llm_successes_list.append(llm_successes)

        # Get BERT predictions
        bert_prediction = process_sentence_bert(sentence)
        bert_predictions.append(bert_prediction)
        
        # Calculate BERT successes
        bert_successes = sum([1 for pred, true in zip(bert_prediction, true_tags) if pred == true])
        bert_successes_list.append(bert_successes)

    # Create DataFrame
    results_df = pd.DataFrame({
        'sentences': sentences,
        'ground_truth': ground_truth_tags,
        'total_words': total_words,
        'llm_raw': llm_raw_predictions,
        'llm_prediction': llm_predictions,
        'llm_successes': llm_successes_list,
        'bert_prediction': bert_predictions,
        'bert_successes': bert_successes_list
    })

    return results_df

In [23]:
n_samples = 500

In [24]:
df = evaluate_predictions(sentences[0:n_samples], ground_truth_tags[0:n_samples])

In [25]:
df.to_csv('results.csv', index=False)

In [None]:
df.head()

Unnamed: 0,sentences,ground_truth,total_words,llm_raw,llm_prediction,llm_successes,bert_prediction,bert_successes
0,"[What, if, Google, Morphed, Into, GoogleOS, ?]","[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]",7,"[{'raw_response': 'Here is the POS-tagged version of the sentence as a list of JSON objects: ```bash [  {""text"": ""What"", ""tag"": ""INTJ""},  {""text"": ""if"", ""tag"": ""CCONJ""},  {""text"": ""Google"", ""tag"": ""PROPN""},  {""text"": ""Morphed"", ""tag"": ""VERB""},  {""text"": ""Into"", ""tag"": ""ADP""},  {""text"": ""GoogleOS"", ""tag"": ""PROPN""},  {""text"": ""?"", ""tag"": ""PUNCT""} ] ``` Explanation: * ""What"" is an interjection (INTJ). * ""if"" is a coordinating conjunction (CCONJ). * ""Google"" is a proper noun (PROPN). * ""Morphed"" is a verb (VERB). * ""Into"" is an adposition (ADP). * ""GoogleOS"" is a proper noun (PROPN). * ""?"" is punctuation (PUNCT).', 'parsed_json': {'error': 'Response could not be parsed as JSON', 'raw_output': 'Here is the POS-tagged version of the sentence as a list of JSON objects: ```bash [  {""text"": ""What"", ""tag"": ""INTJ""},  {""text"": ""if"", ""tag"": ""CCONJ""},  {""text"": ""Google"", ""tag"": ""PROPN""},  {""text"": ""Morphed"", ""tag"": ""VERB""},  {""text"": ""Into"", ""tag"": ""ADP""},  {""text"": ""GoogleOS"", ""tag"": ""PROPN""},  {""text"": ""?"", ""tag"": ""PUNCT""} ] ``` Explanation: * ""What"" is an interjection (INTJ). * ""if"" is a coordinating conjunction (CCONJ). * ""Google"" is a proper noun (PROPN). * ""Morphed"" is a verb (VERB). * ""Into"" is an adposition (ADP). * ""GoogleOS"" is a proper noun (PROPN). * ""?"" is punctuation (PUNCT).'}}]",[],0,"[WP, IN, NNP, VBN, IN, NNP, NNPS, .]",0
1,"[What, if, Google, expanded, on, its, search, -, engine, (, and, now, e-mail, ), wares, into, a, full, -, fledged, operating, system, ?]","[PRON, SCONJ, PROPN, VERB, ADP, PRON, NOUN, PUNCT, NOUN, PUNCT, CCONJ, ADV, NOUN, PUNCT, NOUN, ADP, DET, ADV, PUNCT, ADJ, NOUN, NOUN, PUNCT]",23,"[{'raw_response': '[  {""text"": ""What"", ""tag"": ""PRON""},  {""text"": ""if"", ""tag"": ""CCONJ""},  {""text"": ""Google"", ""tag"": ""PROPN""},  {""text"": ""expanded"", ""tag"": ""VERB""},  {""text"": ""on"", ""tag"": ""ADP""},  {""text"": ""its"", ""tag"": ""PRON""},  {""text"": ""search"", ""tag"": ""NOUN""},  {""text"": ""-"", ""tag"": ""PUNCT""},  {""text"": ""engine"", ""tag"": ""NOUN""},  {""text"": ""("", ""tag"": ""PUNCT""},  {""text"": ""and"", ""tag"": ""CCONJ""},  {""text"": ""now"", ""tag"": ""ADV""},  {""text"": ""e-mail"", ""tag"": ""NOUN""},  {""text"": "")"", ""tag"": ""PUNCT""},  {""text"": ""wares"", ""tag"": ""NOUN""},  {""text"": ""into"", ""tag"": ""ADP""},  {""text"": ""a"", ""tag"": ""DET""},  {""text"": ""full"", ""tag"": ""ADJ""},  {""text"": ""-"", ""tag"": ""PUNCT""},  {""text"": ""fledged"", ""tag"": ""ADJ""},  {""text"": ""operating"", ""tag"": ""VERB""},  {""text"": ""system"", ""tag"": ""NOUN""},  {""text"": ""?"", ""tag"": ""PUNCT""} ]', 'parsed_json': [{'text': 'What', 'tag': 'PRON'}, {'text': 'if', 'tag': 'CCONJ'}, {'text': 'Google', 'tag': 'PROPN'}, {'text': 'expanded', 'tag': 'VERB'}, {'text': 'on', 'tag': 'ADP'}, {'text': 'its', 'tag': 'PRON'}, {'text': 'search', 'tag': 'NOUN'}, {'text': '-', 'tag': 'PUNCT'}, {'text': 'engine', 'tag': 'NOUN'}, {'text': '(', 'tag': 'PUNCT'}, {'text': 'and', 'tag': 'CCONJ'}, {'text': 'now', 'tag': 'ADV'}, {'text': 'e-mail', 'tag': 'NOUN'}, {'text': ')', 'tag': 'PUNCT'}, {'text': 'wares', 'tag': 'NOUN'}, {'text': 'into', 'tag': 'ADP'}, {'text': 'a', 'tag': 'DET'}, {'text': 'full', 'tag': 'ADJ'}, {'text': '-', 'tag': 'PUNCT'}, {'text': 'fledged', 'tag': 'ADJ'}, {'text': 'operating', 'tag': 'VERB'}, {'text': 'system', 'tag': 'NOUN'}, {'text': '?', 'tag': 'PUNCT'}]}]","[PRON, CCONJ, PROPN, VERB, ADP, PRON, NOUN, PUNCT, NOUN, PUNCT, CCONJ, ADV, NOUN, PUNCT, NOUN, ADP, DET, ADJ, PUNCT, ADJ, VERB, NOUN, PUNCT]",20,"[WP, IN, NNP, VBN, IN, PRP$, NN, :, NN, , CC, RB, NN, :, NN, , NNS, IN, SYM, JJ, :, VBN, VBG, NN, .]",0
2,"[[, via, Microsoft, Watch, from, Mary, Jo, Foley, ]]","[PUNCT, ADP, PROPN, PROPN, ADP, PROPN, PROPN, PROPN, PUNCT]",9,"[{'raw_response': 'Here is the POS tagging for the given sentence: [  {""text"": ""["", ""tag"": ""PUNCT""},  {""text"": ""via"", ""tag"": ""ADP""},  {""text"": ""Microsoft"", ""tag"": ""PROPN""},  {""text"": ""Watch"", ""tag"": ""NOUN""},  {""text"": ""from"", ""tag"": ""ADP""},  {""text"": ""Mary"", ""tag"": ""PROPN""},  {""text"": ""Jo"", ""tag"": ""NOUN""},  {""text"": ""Foley"", ""tag"": ""PROPN""},  {""text"": ""]"", ""tag"": ""PUNCT""} ]', 'parsed_json': {'error': 'Response could not be parsed as JSON', 'raw_output': 'Here is the POS tagging for the given sentence: [  {""text"": ""["", ""tag"": ""PUNCT""},  {""text"": ""via"", ""tag"": ""ADP""},  {""text"": ""Microsoft"", ""tag"": ""PROPN""},  {""text"": ""Watch"", ""tag"": ""NOUN""},  {""text"": ""from"", ""tag"": ""ADP""},  {""text"": ""Mary"", ""tag"": ""PROPN""},  {""text"": ""Jo"", ""tag"": ""NOUN""},  {""text"": ""Foley"", ""tag"": ""PROPN""},  {""text"": ""]"", ""tag"": ""PUNCT""} ]'}}]",[],0,"[, IN, NNP, VB, IN, NNP, UH, NNP, ]",0
3,"[(, And, ,, by, the, way, ,, is, anybody, else, just, a, little, nostalgic, for, the, days, when, that, was, a, good, thing, ?, )]","[PUNCT, CCONJ, PUNCT, ADP, DET, NOUN, PUNCT, AUX, PRON, ADJ, ADV, DET, ADJ, NOUN, ADP, DET, NOUN, ADV, PRON, AUX, DET, ADJ, NOUN, PUNCT, PUNCT]",25,"[{'raw_response': '[  {""text"": ""("", ""tag"": ""PUNCT""},  {""text"": ""And"", ""tag"": ""CCONJ""},  {""text"": "","", ""tag"": ""PUNCT""},  {""text"": ""by"", ""tag"": ""ADP""},  {""text"": ""the"", ""tag"": ""DET""},  {""text"": ""way"", ""tag"": ""NOUN""},  {""text"": "","", ""tag"": ""PUNCT""},  {""text"": ""is"", ""tag"": ""AUX""},  {""text"": ""anybody"", ""tag"": ""PRON""},  {""text"": ""else"", ""tag"": ""ADV""},  {""text"": ""just"", ""tag"": ""ADV""},  {""text"": ""a"", ""tag"": ""DET""},  {""text"": ""little"", ""tag"": ""ADJ""},  {""text"": ""nostalgic"", ""tag"": ""ADJ""},  {""text"": ""for"", ""tag"": ""ADP""},  {""text"": ""the"", ""tag"": ""DET""},  {""text"": ""days"", ""tag"": ""NOUN""},  {""text"": ""when"", ""tag"": ""SCONJ""},  {""text"": ""that"", ""tag"": ""PRON""},  {""text"": ""was"", ""tag"": ""AUX""},  {""text"": ""a"", ""tag"": ""DET""},  {""text"": ""good"", ""tag"": ""ADJ""},  {""text"": ""thing"", ""tag"": ""NOUN""},  {""text"": ""?"", ""tag"": ""PUNCT""},  {""text"": "")"", ""tag"": ""PUNCT""} ]', 'parsed_json': [{'text': '(', 'tag': 'PUNCT'}, {'text': 'And', 'tag': 'CCONJ'}, {'text': ',', 'tag': 'PUNCT'}, {'text': 'by', 'tag': 'ADP'}, {'text': 'the', 'tag': 'DET'}, {'text': 'way', 'tag': 'NOUN'}, {'text': ',', 'tag': 'PUNCT'}, {'text': 'is', 'tag': 'AUX'}, {'text': 'anybody', 'tag': 'PRON'}, {'text': 'else', 'tag': 'ADV'}, {'text': 'just', 'tag': 'ADV'}, {'text': 'a', 'tag': 'DET'}, {'text': 'little', 'tag': 'ADJ'}, {'text': 'nostalgic', 'tag': 'ADJ'}, {'text': 'for', 'tag': 'ADP'}, {'text': 'the', 'tag': 'DET'}, {'text': 'days', 'tag': 'NOUN'}, {'text': 'when', 'tag': 'SCONJ'}, {'text': 'that', 'tag': 'PRON'}, {'text': 'was', 'tag': 'AUX'}, {'text': 'a', 'tag': 'DET'}, {'text': 'good', 'tag': 'ADJ'}, {'text': 'thing', 'tag': 'NOUN'}, {'text': '?', 'tag': 'PUNCT'}, {'text': ')', 'tag': 'PUNCT'}]}]","[PUNCT, CCONJ, PUNCT, ADP, DET, NOUN, PUNCT, AUX, PRON, ADV, ADV, DET, ADJ, ADJ, ADP, DET, NOUN, SCONJ, PRON, AUX, DET, ADJ, NOUN, PUNCT, PUNCT]",22,"[, CC, ,, IN, DT, NN, ,, VBZ, NN, RB, RB, SYM, JJ, JJ, IN, DT, NNS, WRB, IN, VBD, SYM, JJ, NN, ., ]",0
4,"[This, BuzzMachine, post, argues, that, Google, 's, rush, toward, ubiquity, might, backfire, --, which, we, 've, all, heard, before, ,, but, it, 's, particularly, well, -, put, in, this, post, .]","[DET, PROPN, NOUN, VERB, SCONJ, PROPN, PART, NOUN, ADP, NOUN, AUX, VERB, PUNCT, PRON, PRON, AUX, ADV, VERB, ADV, PUNCT, CCONJ, PRON, VERB, ADV, ADV, PUNCT, VERB, ADP, DET, NOUN, PUNCT]",31,"[{'raw_response': '[  {""text"": ""This"", ""tag"": ""DT""},  {""text"": ""BuzzMachine"", ""tag"": ""PROPN""},  {""text"": ""post"", ""tag"": ""NOUN""},  {""text"": ""argues"", ""tag"": ""VERB""},  {""text"": ""that"", ""tag"": ""SCONJ""},  {""text"": ""Google"", ""tag"": ""PROPN""},  {""text"": ""'s"", ""tag"": ""PRON""},  {""text"": ""rush"", ""tag"": ""NOUN""},  {""text"": ""toward"", ""tag"": ""ADP""},  {""text"": ""ubiquity"", ""tag"": ""NOUN""},  {""text"": ""might"", ""tag"": ""AUX""},  {""text"": ""backfire"", ""tag"": ""VERB""},  {""text"": ""--"", ""tag"": ""PUNCT""},  {""text"": ""which"", ""tag"": ""PRON""},  {""text"": ""we"", ""tag"": ""PRON""},  {""text"": ""'ve"", ""tag"": ""AUX""},  {""text"": ""all"", ""tag"": ""DET""},  {""text"": ""heard"", ""tag"": ""VERB""},  {""text"": ""before"", ""tag"": ""ADV""},  {""text"": "","", ""tag"": ""PUNCT""},  {""text"": ""but"", ""tag"": ""CCONJ""},  {""text"": ""it"", ""tag"": ""PRON""},  {""text"": ""'s"", ""tag"": ""AUX""},  {""text"": ""particularly"", ""tag"": ""ADV""},  {""text"": ""well"", ""tag"": ""ADJ""},  {""text"": ""-"", ""tag"": ""PUNCT""},  {""text"": ""put"", ""tag"": ""VERB""},  {""text"": ""in"", ""tag"": ""ADP""},  {""text"": ""this"", ""tag"": ""DT""},  {""text"": ""post"", ""tag"": ""NOUN""},  {""text"": ""."", ""tag"": ""PUNCT""} ]', 'parsed_json': [{'text': 'This', 'tag': 'DT'}, {'text': 'BuzzMachine', 'tag': 'PROPN'}, {'text': 'post', 'tag': 'NOUN'}, {'text': 'argues', 'tag': 'VERB'}, {'text': 'that', 'tag': 'SCONJ'}, {'text': 'Google', 'tag': 'PROPN'}, {'text': ""'s"", 'tag': 'PRON'}, {'text': 'rush', 'tag': 'NOUN'}, {'text': 'toward', 'tag': 'ADP'}, {'text': 'ubiquity', 'tag': 'NOUN'}, {'text': 'might', 'tag': 'AUX'}, {'text': 'backfire', 'tag': 'VERB'}, {'text': '--', 'tag': 'PUNCT'}, {'text': 'which', 'tag': 'PRON'}, {'text': 'we', 'tag': 'PRON'}, {'text': ""'ve"", 'tag': 'AUX'}, {'text': 'all', 'tag': 'DET'}, {'text': 'heard', 'tag': 'VERB'}, {'text': 'before', 'tag': 'ADV'}, {'text': ',', 'tag': 'PUNCT'}, {'text': 'but', 'tag': 'CCONJ'}, {'text': 'it', 'tag': 'PRON'}, {'text': ""'s"", 'tag': 'AUX'}, {'text': 'particularly', 'tag': 'ADV'}, {'text': 'well', 'tag': 'ADJ'}, {'text': '-', 'tag': 'PUNCT'}, {'text': 'put', 'tag': 'VERB'}, {'text': 'in', 'tag': 'ADP'}, {'text': 'this', 'tag': 'DT'}, {'text': 'post', 'tag': 'NOUN'}, {'text': '.', 'tag': 'PUNCT'}]}]","[DT, PROPN, NOUN, VERB, SCONJ, PROPN, PRON, NOUN, ADP, NOUN, AUX, VERB, PUNCT, PRON, PRON, AUX, DET, VERB, ADV, PUNCT, CCONJ, PRON, AUX, ADV, ADJ, PUNCT, VERB, ADP, DT, NOUN, PUNCT]",25,"[DT, NNP, NN, VBZ, IN, NNP, POS, VBZ, NN, IN, NN, MD, NN, :, WDT, PRP, VBP, DT, VBN, IN, ,, CC, PRP, POS, VBZ, RB, RB, :, VB, IN, DT, NN, .]",0


In [34]:
df = pd.read_csv('/text-mining/notebooks/results.csv')
df.shape

(500, 8)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_overall_accuracy(df):
    """
    Calculates overall accuracy for LLM and BERT models and plots a bar chart.

    Args:
        df (pd.DataFrame): A DataFrame with columns 'total_words', 'llm_successes', and 'bert_successes'.

    Returns:
        None: Displays a bar plot of overall accuracies.
    """
    # Calculate total words and successes
    total_words = df['total_words'].sum()
    total_llm_successes = df['llm_successes'].sum()
    total_bert_successes = df['bert_successes'].sum()

    # Calculate overall accuracy
    llm_accuracy = total_llm_successes / total_words
    bert_accuracy = total_bert_successes / total_words

    # Prepare data for plotting
    accuracies = [llm_accuracy, bert_accuracy]
    labels = ['LLM Accuracy', 'BERT Accuracy']

    # Plot the bar chart
    plt.figure(figsize=(8, 5))
    plt.bar(labels, accuracies, color=['blue', 'orange'])
    
    # Add labels and title
    plt.ylabel('Accuracy')
    plt.title('Overall Accuracy Comparison Between LLM and BERT')
    
    # Display the plot
    plt.ylim(0, 1)  # Ensure y-axis is between 0 and 1 for accuracy
    for i, acc in enumerate(accuracies):
        plt.text(i, acc + 0.02, f"{acc:.2%}", ha='center', fontsize=12)  # Add percentage labels above bars
    
    plt.tight_layout()
    plt.show()

# Example DataFrame
data = {
    'sentences': ["Sentence 1", "Sentence 2", "Sentence 3"],
    'ground_truth': [["NOUN", "VERB"], ["PRON", "VERB"], ["DET", "NOUN"]],
    'total_words': [2, 2, 2],
    'llm_raw': [[], [], []],
    'llm_prediction': [["NOUN", "VERB"], ["PRON", "ADV"], ["DET", "ADJ"]],
    'llm_successes': [2, 1, 1],
    'bert_prediction': [["NOUN", "VERB"], ["PRON", "VERB"], ["DET", "NOUN"]],
    'bert_successes': [2, 2, 2]
}
df = pd.DataFrame(data)

# Call the function to plot overall accuracy
plot_overall_accuracy(df)


In [27]:
len(sentences)

2077