# Facutal Information Extraction QA from JSON file

## New WORKFLOW
1. Extract Company Name: NER Model
2. Get ticker name
3. Call Yahoo API for ticker infomation
4. Embedding query and key
5. pick the key with highest cosime similarity
6. Answer would be the value corresponding to the key 

In [None]:
!pip install yfinance
!pip install sentence_transformers
!pip install transformers

In [13]:
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import re
import yfinance as yf
import requests

In [14]:
embed_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
ner_model = "Jean-Baptiste/roberta-large-ner-english" # "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=ner_model, aggregation_strategy="simple"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [15]:
def get_ticker_name(company_name):
    yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    params = {"q": company_name, "quotes_count": 1, "country": "United States"}

    res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})
    data = res.json()

    company_code = data['quotes'][0]['symbol']
    return company_code

In [16]:
def get_ticker(query):
    tokens = token_classifier(query)
    company_name = None
    for token in tokens:
        if token['entity_group'] == 'ORG':
            company_name = token['word']
    if not company_name:
        print(f"no company name found in the query: {query}")
        return (None, None)
    tikername = get_ticker_name(company_name)
    ticker = yf.Ticker(tikername)
    return ticker, company_name

In [17]:
def get_key(query, keys):
  max_key, max_cos = '', 0
  for key in keys:
    embedding_1= embed_model.encode(query, convert_to_tensor=True)
    if key[-2:] == "PE":
        new_key = f'{key[:-2]} {key[-2:]}'
    else:
        new_key=re.sub( r"([A-Z])", r" \1", key).split()
        new_key=" ".join(new_key)
        new_key=new_key.lower()
    embedding_2 = embed_model.encode('the ' + new_key + 'of Tesla', convert_to_tensor=True)
    score = util.pytorch_cos_sim(embedding_1, embedding_2)
    if score[0][0] > max_cos:
      max_key, max_cos = key, score
  return max_key, max_cos

In [18]:
def FactualQA(query):
    ticker, company_name = get_ticker(query)
    if ticker is None:
      return "Unable to detect company name"
    info_dict = dict(ticker.info)
    key, score = get_key(query, info_dict.keys())
    answer = info_dict[key] if info_dict[key] is not None else "unavailable"
    return f"Answer: '{answer}', score: {score.numpy()}"

In [19]:
FactualQA('What is the full time employees of Tesla?')



"Answer: '127855', score: [[0.8278109]]"

## EVALUATION

In [20]:
company_list = ['Tesla']
def create_qa_pairs():
    qa_pairs = []
    for company in company_list:
        tikername = get_ticker_name(company)
        ticker_info = dict(yf.Ticker(tikername).info)
        for key,val in ticker_info.items():
            if key == 'companyOfficers':
                question = f"How many officers are in {company}?"
                answer = "unavailable" if val is None else str(val)
                qa_pairs.append((question, answer))
                continue
            if key[-2:] == "PE":
              key = f'{key[:-2]} {key[-2:]}'
            else:
              key=re.sub( r"([A-Z])", r" \1", key).split()
              key=" ".join(key)
              key=key.lower()
            question = f"What is the {key} of {company}?"
            answer = "unavailable" if val is None else str(val)
            qa_pairs.append((question, answer))
    return qa_pairs

In [21]:
dataset = create_qa_pairs()
dataset



[('What is the zip of Tesla?', '78725'),
 ('What is the sector of Tesla?', 'Consumer Cyclical'),
 ('What is the full time employees of Tesla?', '127855'),
 ('What is the long business summary of Tesla?',
  'Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. It operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehicle sales, a network of Tesla Superchargers, and in-app upgrades; purchase financing and leasing services; services for electric vehicles through its company-owned service locations and Tesla mobile service technicians; and vehicle limited warranties a

In [None]:
acc = 0
errors = []
for question, anwser in dataset:
  predict = FactualQA(question)
  if anwser in predict:
      acc += 1
  else:
    errors.append((question, anwser, predict))
num = len(dataset)

In [23]:
print(acc/num)
print(errors)

0.9047619047619048
[('What is the zip of Tesla?', '78725', 'Unable to detect company name'), ('What is the city of Tesla?', 'Austin', 'Unable to detect company name'), ('What is the gross profits of Tesla?', '20853000000', "Answer: '0.15413', score: [[0.83484954]]"), ('What is the target high price of Tesla?', '300', "Answer: '186.61', score: [[0.86713934]]"), ('What is the total cash of Tesla?', '22185000960', "Answer: '7.011', score: [[0.87282354]]"), ('What is the long name of Tesla?', 'Tesla, Inc.', 'Unable to detect company name'), ('What is the shares outstanding of Tesla?', '3164100096', "Answer: '0.028499998', score: [[0.81111366]]"), ('What is the book value of Tesla?', '14.129', "Answer: '15.16314', score: [[0.8181623]]"), ('What is the shares short prior month of Tesla?', '80186849', "Answer: '1672358400', score: [[0.8579195]]"), ('What is the algorithm of Tesla?', 'unavailable', 'Unable to detect company name'), ('What is the ask of Tesla?', '214.17', "Answer: '215.93', sco

## OLD WORKFLOW
1. Extract Company Name: NER Model
2. Get ticker name
3. Call Yahoo API for ticker infomation
4. Create context: Verbolize Json File
5. Feed query and context into QA Model

In [None]:
!pip install yfinance
!pip install transformers

In [None]:
ner_model = "Jean-Baptiste/roberta-large-ner-english" # "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=ner_model, aggregation_strategy="simple"
)
qa_model = "distilbert-base-cased-distilled-squad"
question_answerer = pipeline("question-answering", model=qa_model)

In [None]:
def get_ticker_name(company_name):
    yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    params = {"q": company_name, "quotes_count": 1, "country": "United States"}

    res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})
    data = res.json()

    company_code = data['quotes'][0]['symbol']
    return company_code

In [None]:
def get_ticker(query):
    tokens = token_classifier(query)
    company_name = None
    for token in tokens:
        if token['entity_group'] == 'ORG':
            company_name = token['word']
    if not company_name:
        print(f"no company name found in the query: {query}")
        return (None, None)
    tikername = get_ticker_name(company_name)
    ticker = yf.Ticker(tikername)
    return ticker, company_name

In [None]:
def verbalize(company, info_dict):
  res = []
  for key,val in info_dict.items():
    if key == 'longBusinessSummary':
      res.append(val)
    else:
      if val is None: 
        val = "unavailable"
      elif key[-2:] == "PE":
        key = f'{key[:-2]} {key[-2:]}'
      elif key == 'companyOfficers':
        res.append(f"The officers of {company} are {', '.join(val)}.")
      else:
        key=re.sub( r"([A-Z])", r" \1", key).split()
        key=" ".join(key)
        key=key.lower()
      res.append(f"The {key} of {company} is {str(val)}.")
  
  res.append(f"The {company} is located in {info_dict['address1']}, {info_dict['city']}, {info_dict['state']}, {info_dict['country']} {info_dict['zip']}.")
  return " ".join(res)


In [None]:
def QA(query):
    ticker, company_name = get_ticker(query)
    if ticker is None:
      return "Unable to detect company name"
    context = verbalize(company_name, dict(ticker.info))
    result = question_answerer(question=query, context=context)
    return f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"