# 1. Keyphrase Extraction

[pke reference](https://github.com/boudinfl/pke)

In [2]:
# !pip install git+https://github.com/boudinfl/pke.git
# !pip install flashtext

In [3]:
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text):
    """Split ``text`` into sentences and discard the short ones.

    The text is split with NLTK's ``sent_tokenize``. A sentence is kept only
    if its length, measured before stripping, is greater than 20 characters.
    Kept sentences are returned in document order with surrounding
    whitespace stripped.
    """
    kept = []
    for raw_sentence in sent_tokenize(text):
        # The length test is applied to the unstripped sentence.
        if len(raw_sentence) > 20:
            kept.append(raw_sentence.strip())
    return kept

def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences that mention it.

    Matching is delegated to flashtext's ``KeywordProcessor`` with its
    default settings.

    Args:
        keywords: iterable of keyword / keyphrase strings.
        sentences: iterable of sentence strings to search.

    Returns:
        dict mapping each keyword to the list of sentences in which it was
        found, longest sentence first. A keyword that is never found maps to
        an empty list. A sentence is listed at most once per keyword, even if
        the keyword occurs in it several times.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        # extract_keywords yields one hit per occurrence; dict.fromkeys
        # de-duplicates (keeping first-seen order) so a sentence that repeats
        # a keyword is not appended to that keyword's list more than once.
        for key in dict.fromkeys(keywords_found):
            keyword_sentences[key].append(sentence)

    # Longest sentences first; the sort is stable, so equal-length sentences
    # keep their document order.
    for matches in keyword_sentences.values():
        matches.sort(key=len, reverse=True)
    return keyword_sentences

In [4]:
# Read the whole source document into memory. The encoding is given
# explicitly so the non-ASCII characters in the text (e.g. curly quotes)
# decode the same way regardless of the platform's default locale.
with open("/content/egypt.txt", encoding="utf-8") as f:
    text = f.read()

In [5]:
import pke

# Unsupervised TopicRank keyphrase extractor from the pke library.
extractor = pke.unsupervised.TopicRank()

# Feed the raw document text to the extractor, declared as English.
extractor.load_document(input=text, language='en')

# Candidate selection step, run with the extractor's default arguments.
extractor.candidate_selection()

# Candidate weighting (scoring) step, also with default arguments.
extractor.candidate_weighting()

# Keep the 16 best candidates.
# NOTE(review): items are presumably (keyphrase, score) pairs -- the next cell
# keeps only element [0] of each; confirm against the pke documentation.
keyphrases = extractor.get_n_best(n=16)

In [6]:
# Keep only the phrase (element 0) of each extractor result. The isinstance
# guard makes this cell safe to re-run: once `keyphrases` already holds plain
# strings they are left untouched instead of being cut down to their first
# character.
keyphrases = [kp if isinstance(kp, str) else kp[0] for kp in keyphrases]

In [8]:
import nltk
# Download the NLTK "punkt" tokenizer data (unpacked under tokenizers/punkt).
# It must be present before tokenize_sentences() calls sent_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Split the document into sentences, then group the sentences by the
# extracted keyphrase they mention.
sentences = tokenize_sentences(text)
keyword_sentence_mapping = get_sentences_for_keyword(keyphrases, sentences)

# Prints the complete mapping (every matched sentence for every keyphrase),
# so the output is long.
print(keyword_sentence_mapping)

{'egyptian civilization': ['As Egyptian civilization grew more complex, people took on jobs other than that of a farmer or scribe.', 'The Nile River fed Egyptian civilization for hundreds of years.'], 'egypt': ['As in many ancient societies, much of the knowledge of Egypt came about as priests studied the world to find ways to please the gods.', 'Veins (long streaks) of copper, iron, and bronze were hidden inside desert mountains in the hot Sinai Peninsula, east of Egypt.', 'Because the pharaoh was thought to be a god, government and religion were not separate in ancient Egypt.', 'If Egypt suffered hard times for a long period, the people blamed the pharaoh for angering the gods.', 'For about 500 more years, the kings held Egypt together, but with a much weaker central government.', 'In Egypt, people became slaves if they owed a debt, committed a crime, or were captured in war.', 'Children in Egypt played with toys such as dolls, animal figures, board games, and marbles.', 'The first r

In [10]:
# Two parallel lists: kps[n] is a keyphrase and paras[n] is the passage made
# by joining all of that keyphrase's sentences with single spaces.
kps = list(keyword_sentence_mapping.keys())
paras = [' '.join(matched) for matched in keyword_sentence_mapping.values()]

# Sanity check: exactly one passage per keyphrase.
len(paras)==len(kps)

True

In [11]:
# Number of passages built above (one per keyphrase).
len(paras)

16

# 2. Q&A generation using a T5 model

[question generation reference](https://github.com/patil-suraj/question_generation/blob/master/notebooks/question_generation.ipynb)

In [13]:
# !pip install -U transformers==3.0.0
# !python -m nltk.downloader punkt
# !git clone https://github.com/patil-suraj/question_generation.git

In [14]:
%cd question_generation
from pipelines import pipeline

nlp = pipeline("question-generation")

/content/question_generation


Downloading:   0%|          | 0.00/627 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/656 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [15]:
# Parallel result lists: entry n of each list belongs to the same keyphrase.
# Only keyphrases for which a question was produced are recorded.
new_paras = []
new_kps = []
T5_Q = []
T5_A = []

for kp, para in zip(kps, paras):
  try:
    # Only the first generated question/answer pair of each passage is kept.
    QandA = nlp(para)[0]
    Qs, An = QandA['question'], QandA['answer']
  except Exception as err:
    # Best effort: a passage the pipeline cannot handle (or for which it
    # returns nothing) is skipped, but the skip is reported rather than hidden.
    # Catching Exception instead of using a bare `except` lets
    # KeyboardInterrupt still stop the loop.
    print(f'Skipped {kp!r}: {type(err).__name__}: {err}')
    continue
  print(f'Question: {Qs}\nAnswer: {An}\n', '*='*22+'*')
  new_paras.append(para)
  new_kps.append(kp)
  T5_Q.append(Qs)
  T5_A.append(An)

  beam_id = beam_token_id // vocab_size


Question: What other job did people take on as Egyptian civilization grew more complex?
Answer: a farmer or scribe
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: Who studied the world to find ways to please the gods?
Answer: priests
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: What river flooded every year?
Answer: the Nile River
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: What word meant “great house”?
Answer: pharaoh
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: What was the name of the tomb of Egypt's first rulers?
Answer: mud brick
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: What was replaced by a pyramid of brick or stone?
Answer: mud brick
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: Who bathed before entering a temple?
Answer: a priest
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Question: Who wanted a monument that would show the world how great he was?
Answer: Khufu
 *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=

In [None]:
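# Alternative (disabled): the same generation, but passing each keyphrase and its passage in an explicit "<answer>...<context>..." format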

# for i, j in keyword_sentence_mapping.items():
#     sen = ' '.join(j)
#     frmt = f'<answer>{i}<context>{sen}'
#     try:print(nlp(frmt))
#     except: continue

In [16]:
import pandas as pd

# Column name -> values. The four lists are filled in step by the generation
# loop, so row n holds one keyphrase, its passage and the generated Q/A pair.
dic = {
    'keyphrase': new_kps,
    'passage': new_paras,
    'T5_Q': T5_Q,
    'T5_A': T5_A,
}
df = pd.DataFrame(dic)

df.head()

Unnamed: 0,keyphrase,passage,T5_Q,T5_A
0,egyptian civilization,"As Egyptian civilization grew more complex, pe...",What other job did people take on as Egyptian ...,a farmer or scribe
1,egypt,"As in many ancient societies, much of the know...",Who studied the world to find ways to please t...,priests
2,nile river,"Unlike the Tigris and Euphrates, the Nile Rive...",What river flooded every year?,the Nile River
3,word pharaoh,"The word pharaoh meant “great house,” and it w...",What word meant “great house”?,pharaoh
4,underground tomb,The first rulers of Egypt were often buried in...,What was the name of the tomb of Egypt's first...,mud brick


In [17]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the CSV export in this notebook writes to /content, not to a
# path under this mount -- confirm whether the mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Write every column of the results table to CSV, without the index column.
df.to_csv('/content/T5_model.csv', index=False)