# Data Collection

The purpose of this notebook is to document the process that was used to create the initial data set for this project.

## Scraping From Lotsawa House

Data was scraped primarily from [Lotsawa House](https://www.lotsawahouse.org/).

The code below is a Selenium script that iteratively opens the 'topic' collections on Lotsawa House. Each topic typically has a full-collection PDF available. However, that PDF is not available at a particular URL until it has been requested by clicking the 'PDF' link on the topic page.

This script clicks the link, waits, and then downloads the PDF version of the collected texts for that topic directly from its URL. Note that this code relies on an untracked '.txt' file that contains a list of the topics featured on the website. The script also sanitizes the text of that list to account for spaces, punctuation, and the diacritics common in transliterations of Tibetan and Sanskrit.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import time

def download_pdf(driver, topic):
    """Download the collected-texts PDF for one Lotsawa House topic.

    The topic page is opened in the browser and its 'PDF' link is clicked
    first, because the PDF URL is not served until it has been requested that
    way. The file itself is then fetched with ``requests`` and written to
    ``MLotsawa/data/lotsawahouse/topic-pdfs/<topic>.pdf``.

    Args:
        driver: Selenium WebDriver used to open the topic page.
        topic: URL slug of the topic as it appears in the site's topic URLs.

    Raises:
        requests.RequestException: if the download times out, the connection
            fails, or the server answers with an HTTP error status.
        Exception: any Selenium error raised while loading the page or
            locating the 'PDF' link is propagated unchanged.
    """
    driver.get('https://www.lotsawahouse.org/topics/' + topic + '/')

    driver.implicitly_wait(3)

    driver.execute_script("window.scrollTo(0, 400)")

    # click pdf link so url will be active
    driver.find_element(By.LINK_TEXT, 'PDF').click()

    # fixed delay between the click and the download request
    time.sleep(10)

    # download pdf from active url
    url = ('https://www.lotsawahouse.org/Cgi/make-ebook-cgi.pl?lang=english&path=topics%2F'
           + topic + '&format=PDF&do=download')

    # A timeout keeps one stalled request from hanging the whole scrape.
    response = requests.get(url, timeout=60)
    # Without this check an HTTP error page would be saved to disk as a ".pdf".
    response.raise_for_status()

    with open('MLotsawa/data/lotsawahouse/topic-pdfs/' + topic + '.pdf', 'wb') as f:
        f.write(response.content)

# window to webpage
driver = webdriver.Chrome()

topic_list = []
remaining = []

# Diacritics used in Tibetan/Sanskrit transliteration and the plain ASCII
# spelling used in the site's topic URLs.
diacritic_replacements = (
    ('ā', 'a'), ('ḍ', 'd'), ('é', 'e'), ('ī', 'i'), ('ṃ', 'm'), ('ṇ', 'n'),
    ('ñ', 'n'), ('ö', 'o'), ('ś', 'sh'), ('ṣ', 'sh'), ('ü', 'u'), ('ū', 'u'),
)

# The list contains non-ASCII characters, so do not depend on the platform's
# default encoding.
with open('MLotsawa/data/lotsawahouse/topic-list.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # strip() removes the line terminator, which would otherwise end up
        # inside the topic URL and the PDF file name.
        topic = line.strip().lower()
        if not topic:
            continue  # skip blank lines instead of requesting an empty topic
        topic = topic.replace(' ', '-').replace('&', '').replace('--', '-').replace("'", '')
        for accented, plain in diacritic_replacements:
            topic = topic.replace(accented, plain)
        topic_list.append(topic)

try:
    for topic in topic_list:
        try:
            download_pdf(driver, topic)
        except Exception as err:
            # Best effort: record the topic for a later retry and carry on.
            # `Exception` (not a bare except) lets Ctrl-C stop the run.
            print('download failed for', topic, '-', err)
            remaining.append(topic)
finally:
    # quit() ends the whole browser session even if the loop was interrupted.
    driver.quit()

with open('MLotsawa/data/lotsawahouse/remaining-topics.txt', 'w', encoding='utf-8') as f:
    # one failed topic per line
    f.writelines(failed + '\n' for failed in remaining)

## Ankhi and Microsoft Translator

A small proportion of the training data was sourced from common phrases for language learners, crowd-sourced on the language learning platform Ankhi. This data is readily available online at present; however, there are no sets of English sentences paired with Tibetan. To put together a comparable set of sentence pairs, I programmatically translated English sentences from Ankhi datasets into Tibetan using Microsoft Translator. This process was relatively laborious and produced data of relatively poor quality.

In [None]:
import requests, uuid, json
import pyewts
import time

# Read the Bing / Microsoft Translator API key from its text file and drop
# every newline character from it.
with open('bing-key.txt', 'r') as key_file:
    key = key_file.read().replace('\n', '')


def translate(key, num, batch_num):
    """Translate one mini-batch of English sentences into Tibetan.

    Reads ``MLotsawa/data/ankhi/bing-batches/mini-batches/<num>/<batch_num>.txt``,
    sends every line of it as one text item to the Microsoft Translator v3
    ``/translate`` endpoint (``en`` -> ``bo``) and saves the JSON response as
    ``MLotsawa/data/ankhi/bing-translations/batch<num>-<batch_num>-response.json``.

    Args:
        key: Translator subscription key.
        num: Number of the batch directory.
        batch_num: Number of the mini-batch file inside that directory.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the service answers with an HTTP error status. Nothing
            is written to disk in that case.
    """
    endpoint = "https://api.cognitive.microsofttranslator.com"

    # location, also known as region.
    # required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
    location = "eastus"

    path = '/translate'
    constructed_url = endpoint + path

    params = {
        'api-version': '3.0',
        'from': 'en',
        'to': 'bo',
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        # location required if you're using a multi-service or regional (not global) resource.
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    # One request object per input line. Lines are sent as read, including
    # their line terminator.
    with open('MLotsawa/data/ankhi/bing-batches/mini-batches/'+str(num)+'/'+str(batch_num)+'.txt', 'r') as f:
        body = [{'text': line} for line in f]

    # A timeout keeps one stalled call from hanging the whole batch run.
    response = requests.post(constructed_url, params=params, headers=headers,
                             json=body, timeout=60)
    # Fail here rather than saving an error payload as if it were a list of
    # translations, which would only surface later as an obscure error.
    response.raise_for_status()
    payload = response.json()

    with open('MLotsawa/data/ankhi/bing-translations/batch'+str(num)+'-'+str(batch_num)+'-response.json', 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

Microsoft Translator provides translations into Tibetan in the Tibetan script. I converted this into Wylie transliterations with the following code.

In [None]:
def transliterate(num, batch_num):
    """Pair one translated mini-batch with its English source and save it.

    Loads the translator response saved for ``(num, batch_num)``, converts the
    first translation of every entry from Tibetan script to Wylie with pyewts,
    pairs it with the English line at the same position in the source
    mini-batch, and writes one ``wylie, english`` line per pair.

    Args:
        num: Number of the batch directory.
        batch_num: Number of the mini-batch file inside that directory.

    Raises:
        ValueError: if the saved response is not a list of entries, or if it
            does not contain exactly one entry per English line.
    """
    converter = pyewts.pyewts()

    # The response file is written as UTF-8; read it back the same way instead
    # of relying on the platform default.
    with open('MLotsawa/data/ankhi/bing-translations/batch'+str(num)+'-'+str(batch_num)+'-response.json', encoding='utf-8') as f:
        d = json.load(f)

    if not isinstance(d, list):
        raise ValueError('translator response for batch ' + str(num) + '-' + str(batch_num)
                         + ' is not a list of translations: ' + repr(d)[:200])

    tib = [converter.toWylie(entry['translations'][0]['text']) for entry in d]

    with open('MLotsawa/data/ankhi/bing-batches/mini-batches/'+str(num)+'/'+str(batch_num)+'.txt', 'r') as f:
        eng = f.readlines()

    # Pairing is purely positional, so a length mismatch means misaligned data.
    if len(tib) != len(eng):
        raise ValueError('batch ' + str(num) + '-' + str(batch_num) + ': '
                         + str(len(tib)) + ' translations for '
                         + str(len(eng)) + ' English lines')

    pairs = list(zip(tib, eng))

    # Each pair is serialised through the tuple's repr and then stripped of the
    # repr punctuation. NOTE: these replacements also remove every apostrophe,
    # parenthesis, slash and double quote inside the sentences themselves.
    # NOTE(review): the output name starts with 'batch1' while the input name
    # starts with 'batch' - confirm this is intended.
    with open('MLotsawa/data/ankhi/pairs/batch1'+str(num)+'-'+str(batch_num)+'-pairs.txt', 'w') as f: # make sure to change to 'a'
        f.write('\n'.join(str(pair).replace('\'', '')
                                   .replace('\\n', '')
                                   .replace('(', '')
                                   .replace(')', '')
                                   .replace('/', '')
                                   .replace(' ,', ',')
                                   .replace('"', '') for pair in pairs))

The two steps above were then run over every batch, and the resulting sentence-pair files were combined into a single file with the following code.

In [None]:
# Translate and then transliterate mini-batches 0-9 of batches 109-118.
for j in range(109, 119):
    for batch_num in range(10):
        translate(key, j, batch_num)
        transliterate(j, batch_num)

import os

# Concatenate every per-batch pairs file into one combined file.
# transliterate() writes its pairs under MLotsawa/data/ankhi/pairs/, so read
# from that same directory (the previous '/data/ankhi/...' paths pointed at the
# filesystem root).
pairs_dir = 'MLotsawa/data/ankhi/pairs/'

# sorted() makes the order of the combined file reproducible; os.listdir()
# returns entries in arbitrary order.
pair_files = sorted(os.listdir(pairs_dir))

# Open the combined file once instead of re-opening it for every input file.
with open('MLotsawa/data/ankhi/all-ankhi-pairs.txt', 'a') as g:
    for file in pair_files:
        with open(pairs_dir + file, 'r') as f:
            # The pairs files do not end with a newline, so start each one on
            # a fresh line.
            g.write('\n')
            g.writelines(f)

## Converting PDFs to txt

The translations come in bilingual pdfs which need to be converted to a usable .txt file format.

In [None]:
from PyPDF2 import PdfReader
import os

# Directory of topic PDFs to convert (absolute, machine-specific path).
path = '/home/j/Documents/Projects/Iron-Bridge/lotsawa/data/lotsawahouse/topic-pdfs'

def pdf_to_txt(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF to read; passed straight to ``PdfReader``.

    Returns:
        A list with the extracted text of each page, in page order.
    """
    reader = PdfReader(file)
    return [page.extract_text() for page in reader.pages]

# Convert every PDF in `path` to a text file with the same base name.
txt_dir = '/home/j/Documents/Projects/Iron-Bridge/lotsawa/data/lotsawahouse/topic-txts/'

for file in os.listdir(path):
    base, ext = os.path.splitext(file)
    # Skip anything that is not a PDF instead of feeding it to the PDF reader
    # and slicing four characters off an unrelated name.
    if ext.lower() != '.pdf':
        continue
    text = pdf_to_txt(os.path.join(path, file))
    # The PDFs are bilingual, so write UTF-8 explicitly rather than relying on
    # the platform default encoding.
    with open(txt_dir + base + '.txt', 'w', encoding='utf-8') as f:
        # pages are separated by a newline
        f.write('\n'.join(text))

## Split text into sentence pairs

Now that we've whittled the text down, we can split it into Tibetan and English sentence pairs. Lotsawa House translations are conveniently laid out on alternating lines: first the Tibetan, then the English translation.

In [None]:
from spacy_language_detection import LanguageDetector
import spacy
import os

def get_lang_detector(nlp, name):
    """spaCy component factory that builds the language detector.

    Args:
        nlp: The pipeline the component is being created for (unused).
        name: The component name supplied by spaCy (unused).

    Returns:
        A ``LanguageDetector`` instance created with a fixed seed.
    """
    # Detection is probabilistic; a fixed seed keeps the result for short or
    # ambiguous lines the same from one run to the next.
    return LanguageDetector(seed=42)

# Load the spaCy model whose pipeline will host the language detector.
nlp_model = spacy.load("en_core_web_md")
# Register get_lang_detector as the factory for the "language_detector"
# component; this has to happen before add_pipe() can refer to it by name.
spacy.language.Language.factory("language_detector", func=get_lang_detector)
# Append the detector as the last step of the pipeline.
nlp_model.add_pipe('language_detector', last=True)

# Module-level list that separate_pairs() appends the pairs it finds to.
pairs = []

def detect(text):
    """Return the language code the detector assigns to ``text``.

    Runs ``text`` through ``nlp_model`` and returns the ``'language'`` entry
    of the document's ``language`` extension.
    """
    return nlp_model(text)._.language['language']

def separate_pairs(file, out_path='/home/j/Documents/Projects/Iron-Bridge/lotsawa/data/lotsawahouse/all-lotsawahouse-pairs.txt'):
    """Extract (non-English line, English line) pairs from one text file.

    Every line that is not detected as English and is immediately followed by
    a line detected as English becomes one ``<line>,<next line>`` pair. The
    pairs found in this file are appended to ``out_path`` and also added to the
    module-level ``pairs`` list.

    Note: the two halves are joined with a bare comma and commas inside the
    sentences are not escaped.

    Args:
        file: Path of the text file to scan.
        out_path: File the pairs are appended to. Defaults to the combined
            Lotsawa House pairs file.
    """
    with open(file, 'r', encoding='utf-8') as f:
        text = f.readlines()

    # Detect each line once instead of once as "current" and again as "next".
    langs = [detect(line) for line in text]

    # Collect this file's pairs separately: writing the shared `pairs` list
    # would re-append every pair found in previously processed files.
    file_pairs = []
    for line, lang, next_line, next_lang in zip(text, langs, text[1:], langs[1:]):
        if lang != "en" and next_lang == "en":
            file_pairs.append(line.replace('\n', '') + ',' + next_line)

    pairs.extend(file_pairs)

    if not file_pairs:
        return

    with open(out_path, 'a', encoding='utf-8') as f:
        # Start on a fresh line in case the previous write did not end one.
        f.write('\n')
        f.writelines(file_pairs)

# Build sentence pairs for every topic text file.
path = '/home/j/Documents/Projects/Iron-Bridge/lotsawa/data/lotsawahouse/topic-txts/'

# All pairs are appended to one shared file, so process the inputs in a fixed
# order (os.listdir() order is arbitrary) and ignore anything that is not a
# text file.
for file in sorted(os.listdir(path)):
    if file.endswith('.txt'):
        separate_pairs(path + file)