# WORKHUMAN ASSESSMENT SOLUTION

- ## Import necessary libraries

In [1]:
import warnings
from typing import Optional

import pandas as pd
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

warnings.filterwarnings('ignore')

- ## Create a sample DataFrame

In [2]:
# Dummy data for testing: 20 rows, one per (division, country) observation.
data = {
    # Six divisions with three rows each, followed by two "economy" rows.
    "division": (
        [
            name
            for name in ("sports", "politics", "entertainment", "crime", "technology", "lifestyle")
            for _ in range(3)
        ]
        + ["economy"] * 2
    ),
    # The first twelve rows cycle USA/UK/India; the last eight alternate Canada/Ireland.
    "country": ["USA", "UK", "India"] * 4 + ["Canada", "Ireland"] * 4,
    "articles_count": [
        50, 30, 20,
        25, 15, 35,
        40, 22, 18,
        27, 10, 33,
        19, 29, 21,
        14, 12, 32,
        31, 23,
    ],
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,division,country,articles_count
0,sports,USA,50
1,sports,UK,30
2,sports,India,20
3,politics,USA,25
4,politics,UK,15


- ## Load API keys

In [3]:
import os
from dotenv import find_dotenv, load_dotenv

# Load variables from the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())

# Index os.environ directly for both keys so a missing variable raises KeyError
# right here. The previous str(os.getenv(...)) turned a missing OpenAI key into
# the literal string "None", which only failed later as an authentication error.
oa_api_key = os.environ["OA_API_KEY"]
ls_api_key = os.environ["LANGSMITH_API_KEY"]

- ## Register the application with LangSmith

In [4]:
# Environment settings for LangSmith tracing: switch, API endpoint and project name.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "workhuman_assessment",
    }
)

from langsmith import Client

# LangSmith client authenticated with the key loaded from the environment.
client = Client(api_key=ls_api_key)

- ## Initialize LLM and define output parser

In [5]:
# temperature=0 asks for the least random completion. The model is used here to
# extract entities and look up a number, and its answers are compared against
# fixed expected values, so sampling variety only adds run-to-run noise.
llm = OpenAI(api_key=oa_api_key, temperature=0)

class ArticleQueryResponse(BaseModel):
    """Shape of the JSON object the prompt asks the LLM to return.

    The prompt defines three reply forms: a successful lookup (``Columns``,
    ``Filters``, ``articles_count``), a lookup with no matching rows (the same
    keys plus ``error``) and an irrelevant query (``error`` only). Every field
    therefore has a default so that all three forms fit this schema.
    """
    # Names of the columns the answer refers to.
    Columns: list[str] = Field(default=["articles_count"])
    # Absent from the irrelevant-query reply, hence optional.
    Filters: Optional[dict] = Field(default=None, description="Filters used to retrieve the count of articles")
    # Absent from the irrelevant-query reply, hence optional.
    articles_count: Optional[int] = Field(default=None, description="Retrieved count of matching articles")
    # None when the lookup succeeded; the annotation now matches that default.
    error: Optional[str] = Field(default=None)

# NOTE(review): JsonOutputParser appears to use the model for schema/format
# instructions rather than validating parsed output against it - confirm.
parser = JsonOutputParser(pydantic_object=ArticleQueryResponse)

- ## Define the prompt template

In [6]:
# Prompt sent to the LLM for every request. `{query}` and `{df}` are the two
# placeholders filled in per call. The doubled braces (`{{` / `}}`) around the
# JSON examples are presumably escapes that keep those braces literal when the
# template is formatted - confirm against PromptTemplate's template format.
template = """
You are an intelligent assistant that helps the internal members of an international newspaper.
Your main task is to process user queries regarding article counts in specific divisions and countries.
Determine if the query is relevant to article counts in specific divisions and countries:
Examples of relevant queries:
1. "Tell me the number of articles published by journalists in the Sports journalism division in the United States."
2. "How many articles were published in the Entertainment division in France?"
3. "What is the article count for the Politics division in Germany?"
Examples of irrelevant queries:
1. "Who is Sergio Ramos?"
2. "What is the weather like in New York?"
3. "Tell me about the history of the Eiffel Tower."
Steps:
1. Analyze the query to identify if it includes keywords related to article counts such as "number of articles", "article counts", "division", and "country".
2. If the query is not relevant, return:
{{
    "error": "query not relevant to the use case"
}}
3. If the query is relevant, extract the entities from the user query.
4. Use the provided DataFrame to fetch the number of articles for the identified division and country.
5. If no matching records are found for the specified division and country in the DataFrame, return:
{{
    "Columns": ["articles_count"],
    "Filters": {{
        "Division": "<extracted division>",
        "Country": "<extracted country>"
    }},
    "articles_count": 0,
    "error": "no data available"
}}
6. If matching records are found, return:
{{
    "Columns": ["articles_count"],
    "Filters": {{
        "Division": "<extracted division>",
        "Country": "<extracted country>"
    }},
    "articles_count": <count of articles>
}}

Query: "{query}"
DataFrame: {df}

Ensure to return JSON directly in the expected format without any additional text or explanation.
"""

- ## Define the LLM chain

In [7]:
# Bind the template's two placeholders, then wire prompt -> LLM -> JSON parser.
prompt = PromptTemplate(input_variables=["query", "df"], template=template)

chain = LLMChain(
    prompt=prompt,
    llm=llm,
    output_parser=parser,
)

- ## Define a function to process user queries

In [8]:
def _reconcile_with_dataframe(response, df):
    """Replace the LLM-reported count with the total computed from ``df`` when the extracted filters match rows."""
    expected_columns = {"division", "country", "articles_count"}
    if not isinstance(response, dict) or not expected_columns.issubset(df.columns):
        return response

    filters = response.get("Filters")
    if not isinstance(filters, dict):
        # Irrelevant-query replies carry no filters; nothing to verify.
        return response

    division, country = filters.get("Division"), filters.get("Country")
    if not (isinstance(division, str) and isinstance(country, str)):
        return response

    # Case-insensitive comparison: the LLM may echo the user's capitalisation.
    matches = (
        df["division"].astype(str).str.casefold().eq(division.strip().casefold())
        & df["country"].astype(str).str.casefold().eq(country.strip().casefold())
    )
    if not matches.any():
        # The extracted names are not in the data (possibly an alias such as
        # "United States" for "USA"), so the LLM's answer is left untouched.
        return response

    verified = dict(response)
    # Sum, because one (division, country) pair can occupy several rows.
    verified["articles_count"] = int(df.loc[matches, "articles_count"].sum())
    # Rows exist, so a "no data available" error from the LLM would be wrong.
    verified.pop("error", None)
    return verified


def process_query(query: str, df: pd.DataFrame) -> dict:
    """
    Answer an article-count question and verify the count against the DataFrame.

    The query and the DataFrame (as a list of row dicts) are sent through the
    module-level ``chain``. If the parsed reply names a division and country
    that both occur in ``df``, the LLM-reported ``articles_count`` is replaced
    by the sum computed from ``df`` and any ``error`` key is removed. Replies
    whose filters match no row, or that carry no filters at all, are returned
    exactly as the chain produced them.

    Parameters:
    query (str): The user query.
    df (pd.DataFrame): The DataFrame to use for fetching article counts.

    Returns:
    dict: Parsed LLM response, with ``articles_count`` verified where possible.
    """
    response = chain.run({"query": query, "df": df.to_dict(orient='records')})
    return _reconcile_with_dataframe(response, df)

- ## Run the application with example queries

In [9]:
# Sample questions: two relevant with data, one relevant without data, one off-topic.
user_queries = [
    "Tell me the number of articles published by journalists in the Sports journalism division in the United States",
    "How many articles were published in the Entertainment division in Croatia?",
    "when is the christmas celebrated?",
    "What are the number of articles in crime section in India?"
]

# Process and print results
results = [process_query(question, df) for question in user_queries]
for position, (question, answer) in enumerate(zip(user_queries, results), start=1):
    print(f'user_query {position}:', question)
    print(f'response {position}:', answer, '\n')

user_query 1: Tell me the number of articles published by journalists in the Sports journalism division in the United States
response 1: {'Columns': ['articles_count'], 'Filters': {'Division': 'sports', 'Country': 'USA'}, 'articles_count': 50} 

user_query 2: How many articles were published in the Entertainment division in Croatia?
response 2: {'Columns': ['articles_count'], 'Filters': {'Division': 'entertainment', 'Country': 'Croatia'}, 'articles_count': 0, 'error': 'no data available'} 

user_query 3: when is the christmas celebrated?
response 3: {'error': 'query not relevant to the use case'} 

user_query 4: What are the number of articles in crime section in India?
response 4: {'Columns': ['articles_count'], 'Filters': {'Division': 'crime', 'Country': 'India'}, 'articles_count': 33} 



# -----------------------------------------------------------------------------------------------------------

- ## Create a dataset in LangSmith for application performance evaluation

### Only need to run once
# Evaluation examples as (question, expected answer) pairs; the expected answer
# is a JSON string in the same shape the prompt asks the LLM to return.
# - Division/country pairs present in the sample DataFrame expect its count.
# - Pairs absent from it expect articles_count 0 plus "no data available".
# - Off-topic questions expect only the "query not relevant" error.
example_inputs = [
    ("Tell me the number of articles published by journalists in the Sports journalism division in the United States", '{"Columns": ["articles_count"], "Filters": {"Division": "sports", "Country": "USA"}, "articles_count": 50}'),
    ("How many articles were published in the Entertainment division in France?", '{"Columns": ["articles_count"], "Filters": {"Division": "entertainment", "Country": "France"}, "articles_count": 0, "error": "no data available"}'),
    ("What is the article count for the Politics division in Germany?", '{"Columns": ["articles_count"], "Filters": {"Division": "politics", "Country": "Germany"}, "articles_count": 0, "error": "no data available"}'),
    ("What are the number of articles in the Crime division in Canada?", '{"Columns": ["articles_count"], "Filters": {"Division": "crime", "Country": "Canada"}, "articles_count": 0, "error": "no data available"}'),
    ("How many articles were published in the Sports division in Japan?", '{"Columns": ["articles_count"], "Filters": {"Division": "sports", "Country": "Japan"}, "articles_count": 0, "error": "no data available"}'),
    ("Tell me the number of articles in the Entertainment division in the UK", '{"Columns": ["articles_count"], "Filters": {"Division": "entertainment", "Country": "UK"}, "articles_count": 22}'),
    ("What is the article count for the Politics division in Italy?", '{"Columns": ["articles_count"], "Filters": {"Division": "politics", "Country": "Italy"}, "articles_count": 0, "error": "no data available"}'),
    ("How many articles were published in the Crime division in India?", '{"Columns": ["articles_count"], "Filters": {"Division": "crime", "Country": "India"}, "articles_count": 33}'),
    # "Great Britain" is expected to resolve to the "UK" rows.
    ("Tell me the number of articles published by journalists in the Sports division in Great Britain", '{"Columns": ["articles_count"], "Filters": {"Division": "sports", "Country": "UK"}, "articles_count": 30}'),
    ("What are the number of articles in the Economy division in Canada?", '{"Columns": ["articles_count"], "Filters": {"Division": "economy", "Country": "Canada"}, "articles_count": 31}'),
    ("What is the article count for the Politics division in the United States?", '{"Columns": ["articles_count"], "Filters": {"Division": "politics", "Country": "USA"}, "articles_count": 25}'),
    ("How many articles were published in the Crime division in France?", '{"Columns": ["articles_count"], "Filters": {"Division": "crime", "Country": "France"}, "articles_count": 0, "error": "no data available"}'),
    ("Tell me the number of articles in the Sports division in Italy", '{"Columns": ["articles_count"], "Filters": {"Division": "sports", "Country": "Italy"}, "articles_count": 0, "error": "no data available"}'),
    ("What are the number of articles in the Entertainment division in Japan?", '{"Columns": ["articles_count"], "Filters": {"Division": "entertainment", "Country": "Japan"}, "articles_count": 0, "error": "no data available"}'),
    ("What is the article count for the Politics division in the UK?", '{"Columns": ["articles_count"], "Filters": {"Division": "politics", "Country": "UK"}, "articles_count": 15}'),
    # Off-topic questions: only the error key is expected.
    ("How to solve rubics cube?", '{"error": "query not relevant to the use case"}'),
    ("What is the weather like in New York?", '{"error": "query not relevant to the use case"}'),
    ("Tell me about the history of the Eiffel Tower.", '{"error": "query not relevant to the use case"}'),
    ("What is the capital of Australia?", '{"error": "query not relevant to the use case"}'),
    ("Who won the last World Cup?", '{"error": "query not relevant to the use case"}')
]

# One row per evaluation example: the question and its expected JSON answer.
df_dataset = pd.DataFrame(example_inputs, columns=["Question", "Answer"])
df_dataset.head()

# Column roles for LangSmith: which columns are inputs and which are reference outputs.
input_keys = ["Question"]
output_keys = ["Answer"]

### Create and register dataset
# Creating a dataset under a name that is already taken is rejected, which made
# this cell fail on every run after the first. Reuse the existing dataset so the
# notebook can be re-run top to bottom. An existing dataset is NOT refreshed:
# delete it in LangSmith first if example_inputs has changed.
dataset_name = "wh_dataset"
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.upload_dataframe(
        df=df_dataset,
        input_keys=input_keys,
        output_keys=output_keys,
        name=dataset_name,
        description="Dataset created from a WH assessment",
        data_type="kv",
    )

- ## The evaluation of the application is done in the LangSmith console