# Downloading Tora with metadata
This notebook downloads data from the tora.ws site for future work.

In [1]:
import json
import xmltodict
import requests
import os

In [2]:
# URL template of the XML endpoint; `{0}` is filled with a parasha's `nid` value.
url_format = 'http://mobile.tora.ws/xml/{0}.xml'

# Request headers carrying a desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    )
}

# Book names are used verbatim as keys into `links` and as output file names,
# so the trailing spaces in some of them are kept exactly as they are.
books = [
    'בראשית',
    'שמות ',
    'ויקרא ',
    'במדבר',
    'דברים',
]

# Names of the combined (double) weekly portions.
double_chaps = [
    "פרשת ויקהל פקודי",
    "פרשת תזריע מצורע",
    "פרשת אחרי מות-קדושים",
    "פרשת בהר-בחקותי",
    "פרשת חקת-בלק",
    "פרשת מטות-מסעי",
    "פרשת נצבים-וילך",
]

# Load the link index and keep only the sub-tree that is looked up per book.
with open('links.json', 'r', encoding='utf8') as file:
    links = json.load(file)["תנ''ך"]["תורה"]["תורה לפי פרשיות (שניים מקרא וא' תר')"]

In [3]:
#Function that deals with gimatria - the Jewish numerology
def gimatria(letters):
    """Return the numeric (gimatria) value of a sequence of Hebrew letters.

    Each letter is valued by its 1-based position in the 22-letter
    ``alphabet``: positions 1-9 are units, 10-18 are tens (10..90) and
    19-22 are hundreds (100..400). The values of all letters are summed.

    Parameters
    ----------
    letters : iterable of str
        Usually a string of Hebrew letters. Any character that is not in
        ``alphabet`` (final-form letters are not part of it) is treated as
        invalid input.

    Returns
    -------
    int
        The sum of the letter values. On invalid input a warning is printed
        and the partial sum accumulated before the offending item is
        returned (``0`` if nothing was processed).
    """
    alphabet = 'אבגדהוזחטיכלמנסעפצקרשת'
    result = 0
    try:
        for letter in letters:
            position = alphabet.index(letter) + 1
            # magnitude: 0 -> units, 1 -> tens, 2 -> hundreds.
            magnitude, digit = divmod(position, 10)
            # Position 10 is the first "ten", position 19 is 100, etc., hence
            # the `+ magnitude` shift before scaling.
            result += (digit + magnitude) * (10 ** magnitude)
    except (ValueError, TypeError):
        # ValueError: a character outside the alphabet.
        # TypeError: a non-iterable argument or a non-string item.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print('Wrong parameter, hebrew letters only expected! Got: ', letters)
    return result

In [5]:
# Download every weekly portion of each book and write one JSON file per book.
# The output directory is created up front so the final `open(path, 'w')` cannot
# fail just because 'tora' does not exist yet.
os.makedirs('tora', exist_ok=True)
for book in books:
    path = os.path.join('tora', book + '.json')
    book_data = {
        'name': book,
        'chapters': dict(),      # chapter number -> highest pasuk number seen
        'weekly_chaps': dict(),  # weekly portion name -> its `chap_data`
        'double_chaps': list(),  # combined portions found in this book (once each)
        'psukim': list(),        # 'd' field of every processed pasuk
        'onkelos': list()        # '#text' of the first 't' entry of every processed pasuk
    }
    chaps = links[book]
    for chap in chaps:
        chap_data = {
            'from_to': list(),   # [first pasuk label, last pasuk label]
            'aliyot': dict()     # aliyah name -> [first pasuk label, last pasuk label]
        }
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() surfaces HTTP errors instead of parsing an error page.
        response = requests.get(
            url_format.format(chaps[chap]['nid']),
            headers=headers,
            timeout=30)
        response.raise_for_status()
        data = xmltodict.parse(
            response.content.decode('utf-8', 'ignore'))['book']['chap']
        # Some of the downloaded data is malformed; it is detected here and
        # then corrected by hand.
        try:
            chap_data['from_to'] = [data[0]['p'][0]['@n'], data[-1]['p'][-1]['@n']]
        except KeyError:
            chap_data['from_to'] = [data[1]['p'][1]['@n'], data[-1]['p'][-1]['@n']]
            # Report which weekly portion is damaged.
            print(chap)
        for alia in data:
            if 'p' not in alia:
                continue
            chap_data['aliyot'][alia['@n']] = [alia['p'][0]['@n'], alia['p'][-1]['@n']]
            if chap in double_chaps:
                # Combined portions only contribute their aliyot ranges; their
                # psukim are skipped. Record the name once, not once per aliyah.
                if chap not in book_data['double_chaps']:
                    book_data['double_chaps'].append(chap)
                continue
            for pasuk in alia['p']:
                chap_n, pasuk_n = pasuk['@n'].split('-')
                chap_n = gimatria(chap_n)
                # The first and last characters of the pasuk part are dropped
                # before conversion.
                pasuk_n = gimatria(pasuk_n[1:-1])
                # Keep the largest pasuk number seen for each chapter.
                book_data['chapters'][chap_n] = max(
                    pasuk_n, book_data['chapters'].get(chap_n, pasuk_n))
                book_data['psukim'].append(pasuk['d'])
                # Some aliyot have a broken pasuk division; print them so they
                # can be fixed by hand. NOTE: on failure the pasuk text above has
                # already been appended, so 'psukim' and 'onkelos' differ in
                # length until the data is fixed.
                try:
                    if not isinstance(pasuk['t'], list):
                        pasuk['t'] = [pasuk['t']]
                    book_data['onkelos'].append(pasuk['t'][0]['#text'])
                except KeyError:
                    print(chap, alia['@n'])
        book_data['weekly_chaps'][chap] = chap_data
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(book_data, file, indent=4, ensure_ascii=False)

פרשת יתרו שביעי
פרשת יתרו שביעי
פרשת יתרו שביעי
פרשת פינחס ראשון
פרשת ואתחנן חמישי
פרשת ואתחנן חמישי
פרשת ואתחנן חמישי
פרשת נצבים
פרשת וילך


In [13]:
# Consistency check for the books listed in `problem_books`: for each saved
# JSON file print three counts that should agree -
#   1. the sum of the per-chapter values stored in 'chapters',
#   2. the number of entries in 'psukim',
#   3. the number of entries in 'onkelos'.
problem_books = ['שמות ', 'במדבר', 'דברים']
for book in problem_books:
    with open(os.path.join('tora', book + '.json'), 'r', encoding='utf-8') as file:
        json_represent = json.load(file)
    pasuk_acc_to_chaps = sum(json_represent['chapters'].values())
    pasuk_in_psukim = len(json_represent['psukim'])
    pasuk_in_onkelos = len(json_represent['onkelos'])
    print(pasuk_acc_to_chaps, pasuk_in_psukim, pasuk_in_onkelos)

1210 1210 1210
1288 1288 1288
956 956 956
