<a href="https://colab.research.google.com/github/alejvz/UNSTRUCTURED-DATA-NLP/blob/master/Q%26A_Models_for_relevant_info_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant information extraction
## 1. Regex extraction

In [None]:
import re

# Sample e-mail (headers, body and signature) used as the extraction target.
email_body = """
From: satoshi.nakamoto@pm.me
To: jack@pm.me
Hi Jack,
I'm sorry, I've been distracted by my new Tesla car company, and I'm afraid I'm going to have to delegate the leadership of the bitcoin project to someone else.
I need someone really really skilled. Could you advise me on who to appoint? Elon Musk is too busy already!
Many thanks,
Sat.

P.S.: I've got a Tesla Roadster too!
 - 
Satoshi Nakamoto
Phone number: +1 (650) 566–1191
Bitcoin: 1GttzecjYm19xu3iC8i8NEuM7mB5uZQbKD
"""

# Two or more capitalised words joined by a single space or hyphen:
#   - first word: one or more "Xxx" chunks glued together;
#   - each following word: an uppercase letter optionally followed by
#     lowercase letters and/or dots (so initials such as "J." also count).
regex_expr = r"(?:[A-Z][a-z]+)+(?:[\- ][A-Z](?:[a-z\.]+)?)+"

# The pattern contains no capturing groups, so collecting the full text of
# every match yields the list of candidate names found in the e-mail.
[hit.group(0) for hit in re.finditer(regex_expr, email_body)]

## 2. spaCy

In [None]:
!pip3 install spacy==3.1

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
import spacy
import re

# Same sample e-mail as in the regex section.
email_body = """
From: satoshi.nakamoto@pm.me
To: jack@pm.me
Hi Jack,
I'm sorry, I've been distracted by my new Tesla car company, and I'm afraid I'm going to have to delegate the leadership of the bitcoin project to someone else.
I need someone really really skilled. Could you advise me on who to appoint? Elon Musk is too busy already!
Many thanks,
Sat.

P.S.: I've got a Tesla Roadster too!
 - 
Satoshi Nakamoto
Phone number: +1 (650) 566–1191
Bitcoin: 1GttzecjYm19xu3iC8i8NEuM7mB5uZQbKD
"""

# Load the transformer-based English pipeline that ships with spaCy 3.1.
# On lower spaCy versions, use one of 'en_core_web_sm', 'en_core_web_md'
# or 'en_core_web_lg' instead.
nlp = spacy.load("en_core_web_trf")

# Run the whole pipeline (including named-entity recognition) on the e-mail.
doc = nlp(email_body)

# Keep only the text of the entities labelled as people.
entities = [span.text for span in doc.ents if span.label_ == "PERSON"]
entities

## 3. Q&A Models

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline
import numpy as np

# Instantiate the question-answering pipeline, taking both the model weights
# and the tokenizer from the same checkpoint.
model_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = pipeline(
    task="question-answering",
    model=model_checkpoint,
    tokenizer=model_checkpoint,
)

In [None]:
# E-mail without the From/To headers: the sender has to be inferred from the
# body and the signature only.
email_body = """
Hi Jack,
I'm sorry, I've been distracted by my new Tesla car company, and I'm afraid I'm going to have to delegate the leadership of the bitcoin project to someone else.
I need someone really really skilled. Could you advise me on who to appoint? Elon Musk is too busy already!
Many thanks,
Sat.

P.S.: I've got a Tesla Roadster too!
---
Satoshi Nakamoto
Phone number: +1 (650) 566–1191
Bitcoin: 1GttzecjYm19xu3iC8i8NEuM7mB5uZQbKD
"""

# Several phrasings of the same question; the scores of identical answers
# are summed below, so an answer that wins under many phrasings ranks higher.
questions = [
    "What is his name?",
    "What is her name?",
    "What is their name?",
    "Who sent this?",
    "What is the name of the person who sent this?",
]

# `top_k` is the current name of this argument; the former `topk` spelling
# is deprecated in the question-answering pipeline.
answers = model(
    context=email_body,
    question=questions,
    top_k=2,
)

# Depending on the transformers version and on the number of questions and
# `top_k`, the pipeline returns a single dict, a flat list of dicts, or one
# list of dicts per question. Flatten all of these into one list of dicts.
if isinstance(answers, dict):
    answers = [answers]
flat_answers = []
for item in answers:
    if isinstance(item, dict):
        flat_answers.append(item)
    else:
        flat_answers.extend(item)

# Sum the scores of identical answer strings across all questions.
unique_answers = {}
for a in flat_answers:
    unique_answers[a["answer"]] = unique_answers.get(a["answer"], 0.0) + a["score"]

# Ordering by highest accumulated score first.
result = sorted(unique_answers.items(), key=lambda tup: tup[1], reverse=True)

# Regex checking (optional, disabled): keep only answers that look like a
# capitalised multi-word name. Enabling it requires `import re` in scope.
regex_expr = r"(?:[A-Z][a-z]+)+(?:[\- ][A-Z](?:[a-z\.]+)?)+"
#result = [r for r in result if re.match(regex_expr, r[0])]

# Normalizing probabilities so that the reported scores add up to 1.
# NOTE: despite its name, `scores_avg` holds the *sum* of the scores.
scores_avg = np.sum([r[1] for r in result])
if scores_avg > 0:  # avoid dividing by zero when every score is 0
    result = [(r[0], r[1] / scores_avg) for r in result]

result