# 1. Keyphrase Extraction

[pke reference](https://github.com/boudinfl/pke)

In [3]:
# !pip install git+https://github.com/boudinfl/pke.git
# !pip install flashtext

In [4]:
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text, min_length=20):
    """Split ``text`` into sentences and drop the very short ones.

    Args:
        text: Raw document text to segment.
        min_length: A sentence is kept only when it is strictly longer than
            this many characters. Defaults to 20, the cutoff that used to be
            hard-coded here.

    Returns:
        list[str]: The surviving sentences, each stripped of leading and
        trailing whitespace, in the order ``sent_tokenize`` produced them.
    """
    # sent_tokenize already returns a flat list of sentences, so there is no
    # need to wrap it in another list and flatten it again.
    # The length test runs on the sentence as returned (before stripping),
    # which keeps the filtering identical to the previous implementation.
    return [
        sentence.strip()
        for sentence in sent_tokenize(text)
        if len(sentence) > min_length
    ]

def get_sentences_for_keyword(keywords, sentences):
    """Group sentences under every keyword they mention.

    Args:
        keywords: Iterable of keyword / keyphrase strings to look for.
        sentences: Iterable of sentence strings to scan.

    Returns:
        dict[str, list[str]]: One entry per keyword (keywords with no match
        map to an empty list). Each list holds the matching sentences,
        longest first; sentences of equal length keep their input order.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        # extract_keywords reports one hit per occurrence, so a sentence that
        # mentions the same keyword twice would otherwise be stored twice.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for key in dict.fromkeys(keyword_processor.extract_keywords(sentence)):
            keyword_sentences[key].append(sentence)

    # Longest sentences first; list.sort is stable, so ties keep input order.
    for matched in keyword_sentences.values():
        matched.sort(key=len, reverse=True)
    return keyword_sentences

In [5]:
# Read the whole source document into memory.
# The encoding is pinned because the text contains typographic quotes; without
# it, open() falls back to the platform default and may fail or mis-decode
# outside a UTF-8 locale. Assumes the file is UTF-8 encoded.
with open("/content/egypt.txt", encoding="utf-8") as f:
    text = f.read()

In [36]:
import pke

# Build a TopicRank keyphrase extractor from pke's unsupervised models.
extractor = pke.unsupervised.TopicRank()

# Feed it the raw document text read earlier, processed as English.
extractor.load_document(input=text, language='en')

# Candidate selection and weighting both run with pke's default settings
# (no arguments are passed). The steps must run in this order.
extractor.candidate_selection()

extractor.candidate_weighting()

# Keep the 10 best-ranked candidates.
# Each returned item is indexable with the phrase at position 0
# (presumably (phrase, score) pairs — confirm against the pke docs).
keyphrases = extractor.get_n_best(n=10)

In [37]:
# Keep only the phrase text of every extracted item.
# Entries that are already plain strings are passed through unchanged, so
# re-running this cell does not chop each phrase down to its first character.
keyphrases = [kp if isinstance(kp, str) else kp[0] for kp in keyphrases]

In [38]:
import nltk
# Download the 'punkt' NLTK data package (a no-op when it is already present).
# Presumably this is the model sent_tokenize relies on — it must be available
# before tokenize_sentences() is called.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
# Split the document into sentences, then group them under each keyphrase.
sentences = tokenize_sentences(text)
keyword_sentence_mapping = get_sentences_for_keyword(keyphrases, sentences)

# NOTE(review): this prints the entire mapping (every matched sentence for
# every keyphrase), which produces a very large cell output.
print(keyword_sentence_mapping)

{'egyptian civilization': ['As Egyptian civilization grew more complex, people took on jobs other than that of a farmer or scribe.', 'The Nile River fed Egyptian civilization for hundreds of years.'], 'egypt': ['As in many ancient societies, much of the knowledge of Egypt came about as priests studied the world to find ways to please the gods.', 'Veins (long streaks) of copper, iron, and bronze were hidden inside desert mountains in the hot Sinai Peninsula, east of Egypt.', 'Because the pharaoh was thought to be a god, government and religion were not separate in ancient Egypt.', 'If Egypt suffered hard times for a long period, the people blamed the pharaoh for angering the gods.', 'For about 500 more years, the kings held Egypt together, but with a much weaker central government.', 'In Egypt, people became slaves if they owed a debt, committed a crime, or were captured in war.', 'Children in Egypt played with toys such as dolls, animal figures, board games, and marbles.', 'The first r

In [40]:
# One entry per keyphrase, in the mapping's own order:
#   kps[n]   -> the keyphrase
#   paras[n] -> all sentences that mention it, joined into a single paragraph
kps = list(keyword_sentence_mapping)
paras = [' '.join(matched) for matched in keyword_sentence_mapping.values()]

# Both lists come from the same dict, so their lengths must agree.
len(paras) == len(kps)

True

In [41]:
# Number of paragraphs built above: one per keyphrase in the mapping.
len(paras)

10

# 2. Q&A generation using Haystack

[haystack reference 1](https://github.com/deepset-ai/haystack)

[haystack reference 2](https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/03_Basic_QA_Pipeline_without_Elasticsearch.ipynb)

[haystack reference 3](https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/13_Question_generation.ipynb)

In [12]:
# !pip install --upgrade pip
# !pip install git+https://github.com/deepset-ai/haystack.git

# !wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
# !tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# !chown -R daemon:daemon elasticsearch-7.9.2

In [42]:
import http.client
import os
import time
import urllib.request
from subprocess import DEVNULL, PIPE, STDOUT, Popen

ES_URL = "http://localhost:9200"   # default Elasticsearch HTTP endpoint
ES_STARTUP_TIMEOUT = 120           # seconds to wait before giving up

# Start Elasticsearch as a background process.
# Its output is discarded: a PIPE that nobody reads fills up eventually and
# blocks the child process, and this notebook never reads the server log.
es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"],
    stdout=DEVNULL,
    stderr=STDOUT,
    preexec_fn=lambda: os.setuid(1),  # as daemon
)

# Poll until the HTTP endpoint answers instead of sleeping a fixed 30 seconds:
# continues as soon as the server is up, and fails loudly if it never starts.
_deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
    try:
        with urllib.request.urlopen(ES_URL, timeout=2):
            break
    except (OSError, http.client.HTTPException):
        # URLError / HTTPError / refused connections / timeouts land here.
        if es_server.poll() is not None:
            raise RuntimeError(
                f"Elasticsearch exited during startup (exit code {es_server.returncode})"
            )
        if time.monotonic() >= _deadline:
            raise TimeoutError(
                f"Elasticsearch did not answer on {ES_URL} within {ES_STARTUP_TIMEOUT}s"
            )
        time.sleep(1)

In [43]:
import logging

LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"


def configure_logging():
    """Apply the notebook's log format and levels, even on a pre-configured root logger.

    ``logging.basicConfig`` silently does nothing when the root logger already
    has handlers (as in hosted notebook kernels). The output recorded in this
    notebook shows the default ``LEVEL:name:message`` layout, so the requested
    format was never applied. This function therefore also sets the format on
    every existing root handler and sets the root level explicitly.

    Side effects:
        * root logger level -> WARNING, all root handlers use ``LOG_FORMAT``
        * ``haystack`` logger level -> INFO
    """
    # Installs a stream handler only when the root logger has none yet.
    logging.basicConfig(format=LOG_FORMAT, level=logging.WARNING)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.WARNING)
    formatter = logging.Formatter(LOG_FORMAT)
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    # Haystack's own progress messages are wanted at INFO level.
    logging.getLogger("haystack").setLevel(logging.INFO)


configure_logging()

In [44]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es, print_questions, export_utils


# NOTE(review): an Elasticsearch process is already started with Popen earlier
# in this notebook. launch_es() presumably tries to start another instance
# (via Docker, per the Haystack tutorials) — confirm whether it is still needed.
launch_es()



In [70]:
# Wrap every paragraph in a single-key dict ("content") — the records that
# are written to the document store below.
docs = [dict(content=para) for para in paras]
len(docs)

10

In [46]:
# Connect to Elasticsearch with the document store's default settings.
document_store = ElasticsearchDocumentStore()
# Called without filters — presumably clears documents left over from earlier
# runs so the store ends up holding only `docs` (confirm against Haystack docs).
document_store.delete_documents()
document_store.write_documents(docs)

# Question-generation node, created with its default arguments.
question_generator = QuestionGenerator()

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
Using sep_token, but it is not set yet.


In [47]:
# BM25 retriever over the document store.
retriever = BM25Retriever(document_store=document_store)
# NOTE(review): rqg_pipeline is not used anywhere in the visible part of this
# notebook; only qag_pipeline is run below.
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)

# Reader loaded from the "deepset/roberta-base-squad2" checkpoint; combined with
# the question generator it forms the question + answer generation pipeline.
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.infer:Got ya 2 parallel workers to do inference ...
INFO:haystack.modeling.infer: 0     0  
INFO:haystack.modeling.infer:/w\   /w\ 
INFO:haystack.modeling.infer:/'\   / \ 


In [77]:
# Column-oriented accumulator for the final table: every key is a column name
# that starts out with its own empty list.
data = {column: [] for column in ("keyphrases", "context", "query", "answer")}

In [78]:
# Run question + answer generation on every stored document and collect one
# (query, answer, context) row per generated question.
for idx, document in enumerate(tqdm(document_store)):

    print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
    result = qag_pipeline.run(documents=[document])
    # print_questions(result)  # uncomment to inspect the raw pipeline output

    # The three result lists are parallel (one entry per generated question).
    # Walking them together keeps the columns row-aligned; three separate
    # loops could drift apart if the lists ever differ in length.
    for query, answers, contexts in zip(
        result["queries"], result["answers"], result["documents"]
    ):
        data["query"].append(query)
        # Take the first answer; record None instead of raising IndexError
        # when no answer came back for a question.
        data["answer"].append(answers[0].answer if answers else None)
        data["context"].append(contexts[0].content)

0it [00:00, ?it/s]


 * Generating questions and answers for document 0: As Egyptian civilization grew more complex, people took on jobs other than that of a farmer or scrib...




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 27.14 Batches/s]
1it [00:01,  1.13s/it]


 * Generating questions and answers for document 1: As in many ancient societies, much of the knowledge of Egypt came about as priests studied the world...




Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.34 Batches/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.33 Batches/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.27 Batches/s]
2it [00:03,  2.14s/it]


 * Generating questions and answers for document 2: Unlike the Tigris and Euphrates, the Nile River flooded at the same time every year, so farmers coul...




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 23.62 Batches/s]
3it [00:04,  1.60s/it]


 * Generating questions and answers for document 3: The word pharaoh meant “great house,” and it was originally used to describe the king’s palace....




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 29.46 Batches/s]
4it [00:06,  1.44s/it]


 * Generating questions and answers for document 4: The first rulers of Egypt were often buried in an underground tomb topped by mud brick....




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.55 Batches/s]
5it [00:06,  1.18s/it]


 * Generating questions and answers for document 5: They replaced the mud brick with a small pyramid of brick or stone....




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.43 Batches/s]
6it [00:07,  1.04it/s]


 * Generating questions and answers for document 6: Before entering a temple, a priest bathed and put on special linen garments and white sandals. One o...




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 24.69 Batches/s]
7it [00:08,  1.17it/s]


 * Generating questions and answers for document 7: Cities emerge as centers of culture and power, and people learn to do jobs that do not involve agric...




Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.96 Batches/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.17 Batches/s]
8it [00:09,  1.15s/it]


 * Generating questions and answers for document 8: About 80 years later, a pharaoh named Khufu decided he wanted a monument that would show the world h...




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.99 Batches/s]
9it [00:11,  1.21s/it]


 * Generating questions and answers for document 9: The word pharaoh meant “great house,” and it was originally used to describe the king’s palace. Abou...




Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 16.47 Batches/s]
10it [00:12,  1.22s/it]


In [79]:
# Tag every context with the keyphrases it contains (case-insensitive
# substring match, in the order of `kps`).
# The column is assigned rather than appended to, so re-running this cell
# rebuilds it instead of doubling its length and breaking the DataFrame.
data['keyphrases'] = [
    [kp for kp in kps if kp.lower() in context_lower]
    # Lower-case each context once instead of once per keyphrase.
    for context_lower in (context.lower() for context in data["context"])
]

### Create and save the DataFrame

In [80]:
import pandas as pd

# Turn the collected columns into a table and preview its first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,keyphrases,context,query,answer
0,"[egyptian civilization, egypt, nile river, people, years]","As Egyptian civilization grew more complex, people took on jobs other than t...",What river fed Egyptian civilization for hundreds of years?,The Nile River
1,"[egypt, underground tomb, priest, people, years, king]","As in many ancient societies, much of the knowledge of Egypt came about as p...",When did priests study the world to find ways to please the gods?,"ancient societies, much of the knowledge of Egypt"
2,"[egypt, underground tomb, priest, people, years, king]","As in many ancient societies, much of the knowledge of Egypt came about as p...","Where were veins of copper, iron, and bronze hidden?",desert mountains
3,"[egypt, underground tomb, priest, people, years, king]","As in many ancient societies, much of the knowledge of Egypt came about as p...",Who was the pharaoh?,a god
4,"[egypt, underground tomb, priest, people, years, king]","As in many ancient societies, much of the knowledge of Egypt came about as p...",What peninsula is east of Egypt?,Sinai Peninsula


In [81]:
# Write the table to CSV without the index column; all columns are written
# in their existing order by default.
df.to_csv('/content/haystack_model.csv', index=False)