In [1]:
%%capture
!pip install -q -U langchain langchain_huggingface transformers bitsandbytes accelerate gradio pandas

In [2]:
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import gradio as gr
import pandas as pd
import json
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)

In [3]:
# Interactive Hugging Face Hub login; renders the widget shown in this cell's output.
# NOTE(review): presumably needed so the checkpoints loaded below can be downloaded — confirm they are gated.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prompt iteration on two models (Mistral, Llama) to get the optimal results on test cases

In [4]:
# bitsandbytes settings: 4-bit NF4 weights with double quantization, float16 compute.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

In [13]:
def initialize_model_and_pipeline(model_name, template):
    """
    Load a quantized causal LM and wrap it in a LangChain text-generation pipeline.

    Args:
        model_name (str): Identifier passed to ``from_pretrained``. The model
            family ("llama" / "mistral") is detected case-insensitively from it.
        template (str): The template string associated with the model; it is
            returned unchanged.

    Returns:
        HuggingFacePipeline: A pipeline object wrapped in HuggingFacePipeline.
        str: The template string for the model.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=quantization_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Defaults used for Mistral and for any other model family. Assigning them
    # up front means an unrecognised name no longer leaves these unbound.
    eos_token_id = tokenizer.eos_token_id
    pad_token_id = tokenizer.eos_token_id

    # Case-insensitive so names such as "Meta-Llama-3-8B-Instruct" are recognised
    # even without the lowercase "meta-llama/" organisation prefix.
    if "llama" in model_name.lower():
        # Stop on either the regular EOS token or the end-of-turn token.
        eos_token_id = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]

    text_generation_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        do_sample=True,  # sampling: repeated runs may produce different outputs
        top_k=5,
        num_return_sequences=1,
        eos_token_id=eos_token_id,
        pad_token_id=pad_token_id,
        max_new_tokens=256
    )

    llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
    return llm, template

In [6]:
def generate_response(template, signature):
    """
    Fill the prompt template with an email and run it through the loaded LLM.

    Relies on the module-level ``llm`` object being initialised before the call.

    Args:
        template (str): Prompt template containing a ``{signature}`` placeholder.
        signature: The email text substituted for the placeholder (callers in
            this notebook pass plain strings).

    Returns:
        str: Plain response from LLM.
    """
    # Compose prompt and model into a single runnable, then invoke it once.
    chain = PromptTemplate(input_variables=["signature"], template=template) | llm
    return chain.invoke({"signature": signature})

def grade_signatures(llm_response, ideal):
    """
    Grades the LLM response signature JSON against the ideal response JSON.

    Scoring: one point for every ideal key present in the response, plus one
    point for every such key whose value equals the ideal value. The maximum is
    one point per ideal key plus one value point per ideal key that the
    response contains. Extra keys are reported but do not affect the score.

    Args:
        llm_response (dict): The JSON response from the language model.
        ideal (dict): The ideal JSON structure to compare against.

    Returns:
        tuple: Always a 3-tuple containing:
            - score (int): The total score based on key presence and value matches.
            - total_possible_points (int): The total possible points for correct keys and values.
            - details (dict): A dictionary detailing the discrepancies:
                - "missing_keys" (list): Keys that are in the ideal JSON but missing in the response.
                - "extra_keys" (list): Keys that are in the response but not in the ideal JSON.
                - "value_mismatches" (dict): Keys that have mismatched values, with expected and found values.
                - "error" (str): Present only when one of the inputs was not a dictionary.
    """
    details = {
        "missing_keys": [],
        "extra_keys": [],
        "value_mismatches": {}
    }

    # Keep the documented 3-tuple shape even for malformed input: a non-dict
    # argument is graded as an empty signature and the problem is recorded,
    # so callers that unpack three values do not fail.
    if not isinstance(llm_response, dict) or not isinstance(ideal, dict):
        details["error"] = "One of the inputs is not a dictionary"
        if not isinstance(llm_response, dict):
            llm_response = {}
        if not isinstance(ideal, dict):
            ideal = {}

    # Iterate the dicts directly (insertion order) rather than via sets, so the
    # reported lists are identical from run to run.
    details["missing_keys"] = [key for key in ideal if key not in llm_response]
    details["extra_keys"] = [key for key in llm_response if key not in ideal]
    shared_keys = [key for key in ideal if key in llm_response]

    # Key points: one possible per ideal key, one earned per key that is present.
    total_possible_points = len(ideal)
    score = len(shared_keys)

    # Value points: only keys present in the response are compared.
    for key in shared_keys:
        total_possible_points += 1
        if llm_response[key] == ideal[key]:
            score += 1
        else:
            details["value_mismatches"][key] = {
                "expected": ideal[key],
                "found": llm_response[key]
            }

    return score, total_possible_points, details

def extract_json(template, email):
    """
    Extracts JSON content from an LLM response.

    The text between the last ```json marker and the following ``` fence is
    parsed first. If that is not valid JSON (for example the model left out the
    fence), the span from the first "{" to the last "}" of the response is
    tried instead.

    Args:
        template (str): The template used to generate the response from a particular LLM.
        email (str): The email which contains signature information.

    Returns:
        dict: The extracted JSON content.

    Raises:
        json.JSONDecodeError: If neither candidate parses as JSON.
    """
    response = generate_response(template, email)

    # Flatten to a single line before searching for the fences.
    flattened = response.replace('\n', '')

    # Preferred path: content after the last ```json marker, up to the next ```.
    fenced = flattened.split("```json")[-1].split("```")[0]
    try:
        return json.loads(fenced)
    except json.JSONDecodeError:
        # Fallback for responses without a usable fence: outermost braces.
        start = flattened.find("{")
        end = flattened.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(flattened[start:end + 1])

In [7]:
# Prompt for the Mistral instruct model: the instructions and the email are wrapped in [INST] ... [/INST].
# `{signature}` is the placeholder that receives the email text.
# NOTE(review): the prompt ends with `</s>` right after `[/INST]` — confirm against Mistral's chat
# format that an end-of-sequence marker belongs before the model's answer.
template_mistral = """<s>[INST] Extract email signature information and structure it into a JSON format.
You have to give valid JSON with extracted NERs without any additional explanation.
Possible keys for extracted fields: first_name, last_name, company, position, phone, email, website, linkedin, twitter etc.
Structure the JSON output so that the extracted fields are all contained within a single object __sig__ .
Do not include not found values in JSON. Do not include academic degrees.
If there is several phone numbers, seperate them by comma and space under key `phone`.

Always start JSON with ```json and end with ```

Email signature is given below :
{signature} [/INST] </s>
"""

# Prompt for the Llama instruct model: a system turn with the instructions, a user turn with the
# email, and an opened assistant turn for the model to complete.
# Unlike the Mistral prompt, it does not list candidate keys or the phone-number / academic-degree rules.
template_llama = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Extract email signature information and structure it into a JSON format.
You have to give valid JSON with extracted NERs without any additional explanation.
Structure the JSON output so that the extracted fields (like first name, last name, company, position, etc.) are all contained within a single object __sig__ .
Always start JSON with ```json and end with ```<|eot_id|><|start_header_id|>user<|end_header_id|>

Email signature is given below :
{signature}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

In [15]:
# Initialize the models
llm_name = "mistralai/Mistral-7B-Instruct-v0.3"  # or "meta-llama/Meta-Llama-3-8B-Instruct"
# Case-insensitive, to agree with the lowercase "mistral"/"llama" matching done inside
# initialize_model_and_pipeline (e.g. a path like "models/mistral-7b" must not select the Llama prompt).
template = template_mistral if "mistral" in llm_name.lower() else template_llama
# `selected_template` is the prompt paired with the loaded `llm`; use it for every generation call.
llm, selected_template = initialize_model_and_pipeline(llm_name, template)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# The test emails contain non-ASCII text (Cyrillic names), so the encoding is
# stated explicitly rather than left to the platform default.
with open('test_cases.txt', 'r', encoding='utf-8') as file:
    content = file.read()

test_cases = content.split('\n---\n')  # Split the content by the delimiter
test_cases

['Hi Mike,\nI just wanted to follow up on the project we discussed during our meeting last week and see if you had any updates or needed any assistance from me.\nJane Doe\nSales & Marketing Director\nABC Company 3373 Gregory Lane, 40601 Frankfurt\nt: (800) 555-1133\ne: jane.doe@example.com\nwww.example.com',
 'Hello Maria,\nCan we schedule a meeting to discuss the marketing strategy?\n\nBest,\nTom\ntom@example.com',
 'Hi,\nPlease review the attached document for our next meeting.\n\nBest,\nБогдана Нікітченко\nNLP Engineer\nXYZ Ltd.\nPhone: (555) 123-4567\njane.doe@xyz.com\nwww.xyz.com',
 'Hi Ostap,\nThere is an update on project timelines. We have to schedule a meeting to discuss it. What is the most suitable time for you?\n\nBest,\nSarah Lee\nProduct Manager\nBeta Inc.\nDirect: (111) 222-3333\nMobile: (444) 555-6666\nOffice: (777) 888-9999\nsarah.lee@beta.com',
 "Hi Everyone,\nDon't forget to check our social media for updates.\n\nThanks,\nDavid Green\nHR Manager\nGamma Ltd.\ndavid.gr

In [10]:
# Expected outputs, one JSON object per test case; read as UTF-8 explicitly
# because the expectations include non-ASCII names.
with open('expected_jsons.json', 'r', encoding='utf-8') as file:
    expected_jsons = json.load(file)

In [17]:
# Empty results table: one column per recorded field, no rows yet.
data = {
    column: []
    for column in (
        "score",
        "details_of_mismatches",
        "template",
        "ideal_output",
        "llm_output",
    )
}

df = pd.DataFrame(data)

In [18]:
# Grade the loaded model on every test case.
# The prompt that is run and the prompt that is recorded are both `selected_template`,
# i.e. the template paired with the loaded `llm`, so the results table describes what was run.
records = []
for email_signature, ideal_response in zip(test_cases, expected_jsons):
    try:
        llm_response = extract_json(selected_template, email_signature)
    except json.JSONDecodeError as error:
        # Generation is sampled, so an occasional unparsable answer is expected;
        # record it as a failed case instead of aborting the whole evaluation.
        llm_response = {"error": f"unparsable model output: {error}"}

    # Only dict payloads can be graded; anything else counts as an empty signature.
    llm_response_sig = llm_response.get("__sig__", {}) if isinstance(llm_response, dict) else {}
    if not isinstance(llm_response_sig, dict):
        llm_response_sig = {}
    ideal_sig = ideal_response.get("__sig__", {})

    score, total_possible_points, comparison_details = grade_signatures(llm_response_sig, ideal_sig)
    # An empty ideal signature yields zero possible points; avoid dividing by zero.
    percentage = (score / total_possible_points) * 100 if total_possible_points else 0.0

    records.append({
        "score": percentage,
        "details_of_mismatches": comparison_details,
        "template": selected_template,
        "ideal_output": ideal_response,
        "llm_output": llm_response,
    })

# Build the frame once from the collected rows: re-running this cell gives the same
# table instead of appending to rows left over from a previous run.
df = pd.DataFrame(
    records,
    columns=["score", "details_of_mismatches", "template", "ideal_output", "llm_output"],
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [18]:
# Display the per-case score, mismatch details, prompt, and expected/actual outputs.
df

Unnamed: 0,score,details_of_mismatches,template,ideal_output,llm_output
0,100.0,"{'missing_keys': [], 'extra_keys': ['address']...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Jane', 'last_name'...","{'__sig__': {'first_name': 'Jane', 'last_name'..."
1,100.0,"{'missing_keys': [], 'extra_keys': ['address']...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Jane', 'last_name'...","{'__sig__': {'first_name': 'Jane', 'last_name'..."
2,100.0,"{'missing_keys': [], 'extra_keys': ['position'...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Tom', 'email': 'to...","{'__sig__': {'first_name': 'Tom', 'last_name':..."
3,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Богдана', 'last_na...","{'__sig__': {'first_name': 'Богдана', 'last_na..."
4,100.0,"{'missing_keys': [], 'extra_keys': ['website']...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Sarah', 'last_name...","{'__sig__': {'first_name': 'Sarah', 'last_name..."
5,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'David', 'last_name...","{'__sig__': {'first_name': 'David', 'last_name..."
6,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Rebecca', 'last_na...","{'__sig__': {'first_name': 'Rebecca', 'last_na..."
7,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Anna', 'last_name'...","{'__sig__': {'first_name': 'Anna', 'last_name'..."
8,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Robert', 'last_nam...","{'__sig__': {'first_name': 'Robert', 'last_nam..."
9,100.0,"{'missing_keys': [], 'extra_keys': [], 'value_...",<s>[INST] Extract email signature information ...,"{'__sig__': {'first_name': 'Michael', 'last_na...","{'__sig__': {'first_name': 'Michael', 'last_na..."


In [None]:
# Name the results file after the evaluated model family so that a Llama run
# does not overwrite the Mistral results.
results_path = "mistral.csv" if "mistral" in llm_name.lower() else "llama.csv"
df.to_csv(results_path)

## Custom Interface

In [20]:
def gradio_interface(email):
    """
    Gradio callback: extract the signature from an email and pretty-print it.

    Args:
        email (str): Email text entered in the UI.

    Returns:
        str: The extracted JSON, indented by 4 spaces with non-ASCII characters kept as-is.
    """
    # Use the template paired with the loaded model rather than a hard-coded one,
    # so the prompt format always matches `llm`.
    response = extract_json(selected_template, email)
    return json.dumps(response, indent=4, ensure_ascii=False)

# Build the demo UI around `gradio_interface` (multi-line text in, plain text out) and launch it.
email_box = gr.Textbox(placeholder="Enter your email here...", lines=10)
iface = gr.Interface(
    title="Email Signature Generator",
    outputs="text",
    inputs=email_box,
    fn=gradio_interface,
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://89af475ce14af94a3b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


