# Custom Entity Recognition (Proof of Concept)

### Necessary Libraries

In [1]:
import langchain
import os
from model import init_llm
from dotenv import load_dotenv
from prompts import get_blurb_prompt, get_eval_with_feature_prompt
import random
import time 
import json

# Load variables from a local .env file (python-dotenv) before anything
# else in the notebook runs.
load_dotenv()

# Verbosity flag handed to every LLMChain built in this notebook.
verbose = False

# JSONL files: generated dataset, and the evaluation responses.
file_path = 'dataset_100.jsonl'
eval_file_path = 'eval.jsonl'

### Utility functions

In [2]:
def select_random_subsets(input_list, num_subsets):
    """Draw ``num_subsets`` random subsets of ``input_list``.

    For every subset a size is picked with ``random.randint`` from
    1 to ``len(input_list)`` (inclusive), then that many elements are drawn
    with ``random.sample`` (no element repeated within a subset).

    Args:
        input_list: Sequence to draw elements from.
        num_subsets: How many subsets to produce.

    Returns:
        A list containing ``num_subsets`` lists.
    """
    max_size = len(input_list)
    return [
        random.sample(input_list, random.randint(1, max_size))
        for _ in range(num_subsets)
    ]

## Generation Phase

In [3]:
# Generation chain: the language model from init_llm() paired with the
# prompt from get_blurb_prompt().
llm = init_llm()
prompt = get_blurb_prompt()

gen_query_chain = langchain.LLMChain(llm=llm, prompt=prompt, verbose=verbose)

In [2]:
# Pre-defined feature sets.
#
# `features` is the pool that random subsets are drawn from. `positive` lists
# the features the evaluation prompt is asked about; `negative` lists the
# remaining (distractor) features.

# Alternative pool:
# features = ['price', 'phone_number', 'color', 'dimension']
features = ['color', 'dimension', 'price', 'location']
positive = ['color', 'dimension']
negative = ["price", "location"]

# Candidate values, one list per feature.
colors = ["red", "blue", "green", "yellow", "black", "white", "purple", "orange", "pink", "brown"]
phone_numbers = ["123-456-7890", "234-567-8901", "345-678-9012", "456-789-0123", "567-890-1234", "678-901-2345", "789-012-3456", "890-123-4567", "901-234-5678", "012-345-6789"]
prices = ["$100", "$200", "$300", "$400", "$500", "$600", "$700", "$800", "$900", "$1000"]
dimensions = ["1x1", "2x2", "3x3", "4x4", "5x5", "6x6", "7x7", "8x8", "9x9", "10x10"]
locations = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"]

# Feature name -> candidate values. 'phone_number' is included so that
# switching to the alternative pool above does not raise KeyError when the
# generation loop looks up data['phone_number'].
data = {
    'color': colors,
    'location': locations,
    'price': prices,
    'dimension': dimensions,
    'phone_number': phone_numbers,
}


In [5]:
# Generate one query per random feature subset and append each record to the
# JSONL dataset at `file_path`.
random_subsets = select_random_subsets(features, 100)

# Never filled in this cell; kept in case another cell references it.
ground_truths = []

# Open the dataset once (append mode, as before) instead of once per record.
with open(file_path, "a", encoding="utf-8") as f:
    for feature_set in random_subsets:
        ground_truth_feature = []
        ground_truth_value = []
        negative_feature = []
        negative_value = []
        vals = []

        for feature in feature_set:
            # Fail loudly on a feature with no candidate values: skipping it
            # would leave `vals` misaligned with `feature_set` in the prompt.
            if feature not in data:
                raise KeyError(f"no candidate values defined for feature {feature!r}")

            value = random.choice(data[feature])
            vals.append(value)

            # Features in `positive` are the ground truth; every other
            # feature is recorded as a negative (distractor).
            if feature in positive:
                ground_truth_feature.append(feature)
                ground_truth_value.append(value)
            else:
                negative_feature.append(feature)
                negative_value.append(value)

        output = gen_query_chain.predict(features=feature_set, values=vals)

        temp = {"Query": output,
                "Feature Set": feature_set,
                "Values": vals,
                "Ground Truth Features": ground_truth_feature,
                "Ground Truth Values": ground_truth_value,
                "Negative Features": negative_feature,
                "Negative Values": negative_value}

        json.dump(temp, f)  # Write JSON object
        f.write('\n')       # Add a newline to separate objects
        # Push each record to disk right away so completed generations
        # survive an interruption later in the run.
        f.flush()

## Evaluation Phase

In [3]:
# Evaluation chain: rebinds `llm` and `prompt` to a newly initialised model
# and the prompt from get_eval_with_feature_prompt().
llm = init_llm()
prompt = get_eval_with_feature_prompt()

eval_query_chain = langchain.LLMChain(llm=llm, prompt=prompt, verbose=verbose)

In [5]:
# Run the evaluation chain over every record in the dataset and append each
# model response to `eval_file_path`, one JSON object per line.
# Both files are opened once instead of reopening the output per record.
with open(file_path, 'r', encoding="utf-8") as file, \
        open(eval_file_path, "a", encoding="utf-8") as f:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline added by hand),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        content = json.loads(line)
        output = eval_query_chain.predict(features=positive, post=content['Query'])

        temp = {"Response": output}
        json.dump(temp, f)  # Write JSON object
        f.write('\n')       # Add a newline to separate objects
        # Flush per record so finished evaluations survive an interruption.
        f.flush()
