# Prepare pre-trained models via Google Drive
## In your 'My Drive': 
### 1. Create a directory named 'SQUASH'.
### 2. In 'SQUASH', download and store the following directories along with their contents:

#### [gpt2_corefs_question_generation](https://drive.google.com/drive/folders/1HEbm_sHDAAcylKIF4vIvZ9N2jEA7I5Em)

#### [bert_large_qa_model](https://drive.google.com/drive/folders/1D3fIPuwn0C0zIMg29QSKcnSAc8HfNemd)

### 3. (Optional) Save the input pickle file (`Chunks.pkl`) into the 'SQUASH' directory

 

In [None]:
# Mount Google Drive at /content/drive; the later cells copy the model
# directories and Chunks.pkl from /content/drive/'My Drive'/SQUASH.
from google.colab import drive
drive.mount('/content/drive')

# Setup

In [None]:
import json, pickle, os
import pandas as pd
from google.colab import files


# Fetch the SQUASH code and install its Python dependencies, including the
# bundled pytorch-pretrained-BERT checkout as an editable package.
! git clone https://github.com/amanapte/squash-generation
! pip install pytorch-pretrained-bert pytorch-ignite simplejson
! cd /content/squash-generation/pytorch-pretrained-BERT/ && pip install --editable .
#! cd /content/squash-generation/ && pip install -r requirements.txt

# Copy the two pre-trained model directories from Drive into the repository.
! cp -r /content/drive/'My Drive'/SQUASH/gpt2_corefs_question_generation /content/squash-generation/question-generation/
! cp -r /content/drive/'My Drive'/SQUASH/bert_large_qa_model /content/squash-generation/question-answering/

# Path to temp directory (keep the trailing slash: later cells build paths
# by plain string concatenation)
temp_path = "/content/squash-generation/squash/temp/"

# Path to queue file
queue_path = temp_path + "queue.txt"

# Remove the sample run and README files that ship in the temp/ and final/
# directories of the repository
! rm -rf /content/squash-generation/squash/temp/quac_869/
! rm /content/squash-generation/squash/temp/README.md
! rm /content/squash-generation/squash/final/README.md

# Separator line; not referenced by any other cell visible in this notebook
sep = "------------------------------------------------------"


# Run custom paragraph through SQUASH

In [None]:
# Paragraph to run. To use your own text, replace the contents of the
# triple-quoted string below.
chunk_para = """Coffee is a brewed drink prepared from roasted coffee beans, the seeds of berries from certain Coffea species. Once ripe, coffee berries are picked while green and unripe, processed, and dried. Dried coffee seeds (referred to as beans") are roasted to varying degrees, depending on the desired flavor. Roasted beans are ground and then brewed with near-boiling water to produce the beverage known as coffee. Coffee is darkly colored, bitter, slightly acidic and has a stimulating effect in humans, primarily due to its caffeine content. It is one of the most popular drinks in the world, and can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte). It is usually served hot, although iced coffee is common. Clinical research indicates that moderate coffee consumption is benign or mildly beneficial as a stimulant in healthy adults, with continuing research on whether long-term consumption reduces the risk of some diseases, although those long-term studies are generally of poor quality. The earliest credible evidence of coffee-drinking as the modern beverage appears in modern-day Yemen in southern Arabia in the middle of the 15th century in Sufi shrines where coffee seeds were first roasted and brewed in a manner similar to how it is now prepared for drinking. The Yemenis procured the coffee beans from the Ethiopian Highlands and began cultivation. By the 16th century, the drink had reached the rest of the Middle East and North Africa, later spreading to Europe. The two most commonly grown are C. arabica and C. robusta. Coffee plants are now cultivated in over 70 countries, primarily in the equatorial regions of the Americas, Southeast Asia, the Indian subcontinent, and Africa. As of 2018, Brazil was the leading grower of coffee beans, producing 35% of the world total. Coffee is a major export commodity as the leading legal agricultural export for numerous countries. 
It is one of the most valuable commodities exported by developing countries. Green, unroasted coffee is one of the most traded agricultural commodities in the world. The way developed countries trade coffee with developing nations has been criticised, as well as the impact on the environment with regards to the clearing of land for coffee-growing and water use. Consequently, the markets for fair trade and organic coffee are expanding."""
# chunk_para = ""

# Collapse every run of whitespace (including the newline embedded in the
# literal above) into a single space, so the stored text is one clean line.
clean_chunk_para = " ".join(chunk_para.split())

# Key of this job; it is also the name of the job directory under temp/.
custom_key = "Custom"

# SQUASH settings
top_p = 0.9
gen_frac = 0.5
spec_frac = 0.8

# Build the metadata as a plain dict and let json.dumps handle all quoting
# and escaping, instead of concatenating JSON fragments and re-parsing them.
json_dict = {
    "input_text": clean_chunk_para,
    "key": custom_key,
    "timestamp": "2020-06-17 10:55:12.201741",
    "settings": {
        "top_p": top_p,
        "gen_frac": gen_frac,
        "spec_frac": spec_frac,
    },
}
json_cont = json.dumps(json_dict, indent=4, sort_keys=True)

# os.makedirs raises FileExistsError when temp/Custom/ is left over from a
# previous run; run the cleanup cell below before re-running this one.
chunk_int_path = temp_path + custom_key + "/"
os.makedirs(chunk_int_path)
json_file_path = chunk_int_path + "metadata.json"
with open(json_file_path, "w") as output_file:
    output_file.write(json_cont)

# Overwrite the queue so it holds only this job.
# NOTE(review): the key is written JSON-encoded (with its surrounding double
# quotes) and without a trailing newline, whereas the batch cells further
# down append bare keys one per line -- confirm this is what
# squash/pipeline_QA.sh expects.
dump_key = json.dumps(custom_key)
with open(queue_path, "w") as queue_file:
    queue_file.write(dump_key)

# Run Squash pipeline (from the repository root, since the script path is
# relative to it)
! cd /content/squash-generation/ && bash squash/pipeline_QA.sh

# Print output JSON
# (presumably written by the pipeline under the job key "Custom" -- verify
# against squash/pipeline_QA.sh)
! cat /content/squash-generation/squash/final/Custom.json

# Optional: uncomment the two lines below to zip the final/ directory together
# with temp/Custom/ and download the archive through the browser.
# # Download all output JSONs in a zip
# ! cd /content/squash-generation/squash && zip -r ./Custom.zip ./final/ ./temp/Custom/
# files.download('/content/squash-generation/squash/Custom.zip')

### Run this cell before re-running the custom paragraph cell above.

In [6]:
# Remove the job directory and the final output of the previous custom run.
# The custom paragraph cell calls os.makedirs on temp/Custom/ and fails with
# FileExistsError if that directory is still present.
! rm -rf /content/squash-generation/squash/temp/Custom
! rm /content/squash-generation/squash/final/Custom.json

# Run through the KB database and generate input JSONs

In [None]:
# Copy the pickled chunk table from Drive onto the local runtime disk.
! cp /content/drive/'My Drive'/SQUASH/Chunks.pkl /content/

# Path to Pickle chunk file
chunk_path = "/content/Chunks.pkl"

# Read Pickle chunk file into Pandas; the loop below reads its 'Chunks' column.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# Chunks.pkl that you created yourself or otherwise trust.
chunk_file = pd.read_pickle(chunk_path)

article_count = len(chunk_file)

# Write one job directory (temp/<key>/metadata.json) per article, keyed by
# the 1-based row position.
# NOTE(review): the range stops at article_count - 1, so the last row of the
# table is never processed. The queue loop at the end of this cell matches
# that, so the behaviour is kept as is -- confirm whether skipping the last
# article is intended.
for i in range(0, article_count - 1):
    # Kb_article = chunk_file['KB Article'].iloc[i]
    chunk = chunk_file['Chunks'].iloc[i]

    # Concatenate the pieces of the article (assumes `chunk` is a list-like
    # sequence of text pieces -- TODO confirm), then collapse every run of
    # whitespace into a single space.
    chunk_para = "".join(str(piece) for piece in chunk)
    clean_chunk_para = " ".join(chunk_para.split())

    # Store the whitespace-normalised text, as the custom paragraph cell does;
    # previously the cleaned text was computed but the raw text was written.
    key = str(i + 1)
    json_dict = {
        "input_text": clean_chunk_para,
        "key": key,
        "timestamp": "2020-06-17 10:55:12.201741",
        "settings": {"top_p": 0.9, "gen_frac": 0.5, "spec_frac": 0.8},
    }
    json_cont = json.dumps(json_dict, indent=4, sort_keys=True)

    # os.makedirs raises FileExistsError if the job directory already exists,
    # so metadata.json is always a new file here; "w" makes that explicit and
    # can never leave two JSON documents in one file.
    chunk_int_path = temp_path + key + "/"
    os.makedirs(chunk_int_path)
    json_file_path = chunk_int_path + "metadata.json"
    with open(json_file_path, "w") as output_file:
        output_file.write(json_cont)

# Append the generated keys to the queue, one per line.
with open(queue_path, "a") as queue_file:
    for i in range(1, article_count):
        queue_file.write(str(i))
        queue_file.write("\n")

# Run the demo pipeline script from the repository root.
# (presumably it consumes the keys queued above -- verify against the script)
! cd /content/squash-generation/ && bash squash/pipeline_demo.sh

# Create input JSONs with different settings for a single article 

In [None]:
# Copy the pickled chunk table from Drive onto the local runtime disk.
! cp /content/drive/'My Drive'/SQUASH/Chunks.pkl /content/

# Path to Pickle chunk file
chunk_path = "/content/Chunks.pkl"

# Read Pickle chunk file into Pandas; the code below reads its 'KB Article'
# and 'Chunks' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# Chunks.pkl that you created yourself or otherwise trust.
chunk_file = pd.read_pickle(chunk_path)

# Row position of the single article to sweep settings over.
article_index = 0

# Not used below; kept so the value can be inspected interactively.
Kb_article = chunk_file['KB Article'].iloc[article_index]
chunk = chunk_file['Chunks'].iloc[article_index]

# Prepare the article text once, before the sweep: concatenate its pieces
# (assumes `chunk` is a list-like sequence of text pieces -- TODO confirm)
# and collapse every run of whitespace into a single space. Doing this outside
# any loop also keeps an empty chunk from leaving the text undefined.
chunk_para = "".join(str(piece) for piece in chunk)
clean_chunk_para = " ".join(chunk_para.split())

# Sweep the settings on a grid expressed in whole percents, so the values stay
# exact integers until the final division:
#   top_p     0.75 .. 0.95 in steps of 0.05 (5 values)
#   gen_frac  0.30 .. 0.55 in steps of 0.05 (6 values)
#   spec_frac 0.50 .. 0.85 in steps of 0.05 (8 values)
# which gives 240 jobs, numbered from 1. The loop variables carry a _pct
# suffix so they do not overwrite the float top_p / gen_frac / spec_frac
# settings defined by the custom paragraph cell.
job_count = 0
for top_p_pct in range(75, 100, 5):
    for gen_frac_pct in range(30, 60, 5):
        for spec_frac_pct in range(50, 90, 5):
            job_count += 1

            # The key matches the job directory name, as in the other cells;
            # previously every job in the sweep carried the key "1".
            key = str(job_count)
            json_dict = {
                "input_text": clean_chunk_para,
                "key": key,
                "timestamp": "2020-06-17 10:55:12.201741",
                "settings": {
                    "top_p": top_p_pct / 100,
                    "gen_frac": gen_frac_pct / 100,
                    "spec_frac": spec_frac_pct / 100,
                },
            }
            json_cont = json.dumps(json_dict, indent=4, sort_keys=True)

            # os.makedirs raises FileExistsError if the job directory already
            # exists, so metadata.json is always a new file here.
            chunk_int_path = temp_path + key + "/"
            os.makedirs(chunk_int_path)
            json_file_path = chunk_int_path + "metadata.json"
            with open(json_file_path, "w") as output_file:
                output_file.write(json_cont)

# Append the generated keys to the queue, one per line.
with open(queue_path, "a") as queue_file:
    for q in range(1, job_count + 1):
        queue_file.write(str(q))
        queue_file.write("\n")

# Run the demo pipeline script from the repository root.
# (presumably it consumes the keys queued above -- verify against the script)
! cd /content/squash-generation/ && bash squash/pipeline_demo.sh

# Delete and remake the temp and final directories
## Run this cell if you want to run SQUASH pipeline after it has been executed once.

In [None]:
# ! rm -rf /content/squash-generation/squash/temp/
# ! rm -rf /content/squash-generation/squash/final/
# ! mkdir /content/squash-generation/squash/temp/
# ! mkdir /content/squash-generation/squash/final/

# Format of input json file

In [None]:
# {
#     "input_text": "Place your text here.",
#     "key": "Key",
#     "timestamp": "2020-06-17 10:55:12.201741",
#     "settings": {
#         "top_p": 0.9,
#         "gen_frac": 0.5,
#         "spec_frac": 0.8
#     }
# }