In [32]:
import requests
from pyquery import PyQuery as pq
import csv
import os
import json
import hashlib
import time
import re

In [3]:
def url_to_dict(url):
    """Parse a raw ``key=value&key=value`` string into a dict.

    No percent-decoding is performed: keys and values are returned exactly as
    they appear in ``url``.

    * Empty segments (an empty input string, a trailing or doubled ``&``) are
      skipped instead of raising ``ValueError``.
    * Only the first ``=`` of a segment separates key from value, so a value
      may itself contain ``=``.
    * A segment without ``=`` maps the whole segment to an empty string.
    * If a key occurs more than once, the last occurrence wins.
    """
    pairs = {}
    for segment in url.split('&'):
        if not segment:
            continue
        key, _, value = segment.partition('=')
        pairs[key] = value
    return pairs

In [36]:
# Endpoint templates. The '%d' placeholder is filled with the running request
# counter `dt` each time a request is built.
search_url = r'https://www.sapak-mushlam.co.il/mushlamprovidersearch/Handlers/SearchProviders.ashx?dt=%d'
details_url = r'https://www.sapak-mushlam.co.il/mushlamprovidersearch/Handlers/GetProviderDetails.ashx?dt=%d'

# Request payload templates, parsed straight into dicts.
# search_page() overwrites 'Page' and 'reSearch' before every search request.
search_data = url_to_dict('reSearch=1&SearchType=2&ResultsType=0&ProviderType=surgeon&DoctorIds=&ServiceId=S31-0&CityId=&Language=&WeekDays=&HoursFrom=&HoursTo=&Gender=&SearchFlag=0&CatAndSpec=&Profession=&OptTreatmentId=&SurgeryId=&HospitalId=&surgeonOnly=false&mProviderId=&cityIdExt=&cityName=&cityNameExt=&cityInUse=&subject=&subjectExt=&alternativeSearch=0&specialWords=&fixedWords=&Page=1&rp=10&orderBy=&AdvisorType=-2')
# get_details() overwrites 'DocPlaceID', 'DoctorId' and 'OperationId' before
# every details request.
details_data = url_to_dict('DocPlaceID=529&DoctorId=4182&Class=table_line1&ProviderType=surgeon&ResultsType=0&OperationId=238')

In [161]:
# Cache key of the most recent cache *hit* in get_with_cache(); stays None
# until the first hit.
lastkey = None
# Running counter substituted into the '?dt=%d' URL templates; get_details()
# and search_page() increment it after each get_with_cache() call.
dt = 1

In [168]:
def get_with_cache(url, data):
    """Return the response text for ``url``/``data``, using an on-disk cache.

    The cache key is the MD5 hex digest of ``data`` serialised as JSON with
    sorted keys; ``url`` is *not* part of the key. Cached bodies are stored as
    UTF-8 text files inside the ``mushlam_cache`` directory, which is created
    on demand.

    On a cache hit the module-level ``lastkey`` is set to the key and no
    request is made. On a miss the URL is requested with ``data`` as the
    request payload, the function sleeps 3 seconds (presumably to throttle
    requests to the site), and the body is written to the cache.

    Args:
        url: Address to request on a cache miss.
        data: JSON-serialisable dict sent as the request payload.

    Returns:
        The response body as ``str``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status. The
            body is not cached in that case, so a retry re-requests it instead
            of replaying a stored error page.
    """
    global lastkey
    cache_dir = 'mushlam_cache'
    key = hashlib.md5(json.dumps(data, sort_keys=True).encode('utf8')).hexdigest()
    cache_path = os.path.join(cache_dir, key)
    try:
        with open(cache_path, encoding='utf8') as cached:
            content = cached.read()
        lastkey = key
    except FileNotFoundError:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
        resp = requests.get(url, data=data, headers=headers)
        time.sleep(3)
        # Refuse to cache error responses; the caller's retry loop handles the exception.
        resp.raise_for_status()
        content = resp.text
        # A missing cache directory also surfaces as FileNotFoundError above,
        # so make sure it exists before writing.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, 'w', encoding='utf8') as cached:
            cached.write(content)
    return content

In [169]:
def get_details(props):
    """Extend a search-result record with fields from the provider details page.

    The ``plcId``, ``docId`` and ``oprtId`` entries of ``props`` are copied into
    the module-level ``details_data`` payload and then removed from ``props``.
    The details page is fetched through ``get_with_cache`` and the global
    request counter ``dt`` is incremented afterwards.

    Every ``<td>`` with exactly two children whose first child has a class
    starting with ``text_blue`` contributes one entry: the first child's text
    (colons removed, stripped) is the key and the second child's text is the
    value. Values containing ``₪`` have the sign and thousands commas removed.

    Returns the same ``props`` dict, mutated in place.
    """
    global dt

    # (payload key, record key) pairs; fill the payload first, then drop the
    # identifiers from the record.
    id_fields = (('DocPlaceID', 'plcId'),
                 ('DoctorId', 'docId'),
                 ('OperationId', 'oprtId'))
    for payload_key, record_key in id_fields:
        details_data[payload_key] = props[record_key]
    for _, record_key in id_fields:
        del props[record_key]

    html = get_with_cache(details_url % dt, details_data)
    dt += 1

    for td in pq(html).find('td'):
        children = pq(td).children()
        if len(children) != 2:
            continue
        label_el = pq(children[0])
        label_cls = label_el.attr('class')
        if label_cls is None or not label_cls.startswith('text_blue'):
            continue
        label = label_el.text().replace(':', '').strip()
        value = pq(children[1]).text()
        if '₪' in value:
            value = value.replace('₪', '').replace(',', '').strip()
        props[label] = value
    return props
   

In [170]:
def search_page(page):
    """Generate one record per provider row on result page ``page``.

    Sets ``Page`` and ``reSearch`` in the module-level ``search_data`` payload
    (``reSearch`` is ``'0'`` for page 1 and ``'1'`` otherwise), fetches the page
    through ``get_with_cache`` and increments the global counter ``dt``.

    Only ``<tr>`` elements whose class attribute contains ``docId:`` are used.
    The class tokens are split on ``:`` into key/value pairs, table cells 1-5
    supply name, operation, hospital and the two phone numbers, and the record
    is completed by ``get_details`` before being yielded.

    After the rows, ``None`` is yielded as an end marker when fewer than 10
    records were produced.
    """
    global dt
    search_data['Page'] = str(page)
    search_data['reSearch'] = '1' if page != 1 else '0'
    html = get_with_cache(search_url % dt, search_data)
    dt += 1

    # Column name for table cells 1..5 of a result row, in order.
    cell_columns = ('name', 'operation', 'hospital', 'hospital_phone', 'clinic_phone')

    produced = 0
    for tr in pq(html).find('tr'):
        row = pq(tr)
        css = row.attr('class')
        if css is not None and 'docId:' in css:
            props = dict(token.split(':') for token in row.attr('class').split())
            cells = row.find('td')
            for position, column in enumerate(cell_columns, start=1):
                props[column] = pq(cells[position]).text()
            yield get_details(props)
            produced += 1
    if produced < 10:
        yield None

In [171]:
def search_all():
    """Yield provider records from consecutive result pages, starting at page 1.

    Pages are read with ``search_page``. The generator finishes as soon as a
    page yields the ``None`` end marker; otherwise it moves on to the next page
    once the current one is exhausted.
    """
    page = 1
    while True:
        for record in search_page(page):
            if record is None:
                return
            yield record
        page += 1

In [172]:
# Column order for mushlam.csv; this list is passed as the fieldnames of the
# csv.DictWriter in the export cell.
# 'name', 'operation', 'hospital', 'hospital_phone' and 'clinic_phone' are set
# in search_page(). The Hebrew names are presumably the field labels scraped
# from the details page by get_details(), and 'yMap'/'xMap' presumably come
# from the row's class tokens -- verify against the live pages.
# Hebrew entries, in order of appearance: co-payment amount (Platinum),
# co-payment amount (Mushlam), service, spoken language, spoken languages,
# role, gender, hospital address, additional information.
headers=['name',
         'operation',
         'hospital', 
         'גובה השתתפות עצמית פלטינום', 
         'גובה השתתפות עצמית מושלם',
         'שירות',
         'שפת דיבור',
         'שפות דיבור',
         'תפקיד',
         'מין',
         'hospital_phone',
         'clinic_phone',
         'כתובת בית החולים', 
         'מידע נוסף', 
         'yMap', 
         'xMap', 
        ]

In [173]:
# Scrape every result page into mushlam.csv. If anything fails mid-scrape the
# error is printed, the loop waits 60 seconds and the export restarts from an
# empty file; pages fetched earlier are then read back through get_with_cache.
while True:
    # newline='' is what the csv module requires for files it writes to, and
    # the with-block guarantees the file is flushed and closed before any later
    # cell reads mushlam.csv.
    with open('mushlam.csv', 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, headers)
        writer.writeheader()
        try:
            writer.writerows(search_all())
            finished = True
        except Exception as e:
            # Deliberate best-effort: report and retry rather than abort the run.
            print(repr(e))
            finished = False
    if finished:
        break
    time.sleep(60)


KKK e5ab6beef963449b00f35588bce5f6b0


### This part compares the Clalit and Harel data

In [3]:
# Load both data sets, closing each file as soon as it has been read.
# newline='' is what the csv module requires for files it reads from, so that
# newlines embedded in quoted fields are interpreted correctly.
with open('mushlam.csv', newline='') as clalit_file:
    clalit_data = list(csv.DictReader(clalit_file))
with open('harel_response.json') as harel_file:
    harel_data = json.load(harel_file)['data']['lines']

In [60]:
# Matches runs of Hebrew letters; the sorted runs form a word-order-insensitive
# identity for a doctor's name.
name_re = re.compile('[א-ת]+')
# doctor_id -> merged record: {'name': ..., '<src>': True, '<src>_treatments': '...'}.
doctor_data = {}
# Column order for the combined CSV; aggregate_doctors() appends '<src>' and
# '<src>_treatments' the first time a source contributes a row.
keys = ['name']

# Markup, titles and punctuation removed from a name before matching, applied
# in exactly this order.
_NAME_NOISE = ('<nobr>',
               '</nobr>',
               'ד"ר',
               "דר'",
               "פר'",
               "פרופ'",
               "'", "׳", "-")


def aggregate_doctors(src, data_):
    """Merge the rows of one data source into the shared ``doctor_data`` table.

    For every row the name is read from ``'name'`` (falling back to
    ``'sname'``), stripped of the ``_NAME_NOISE`` fragments, and has runs of
    spaces collapsed. Rows with a missing or empty name are skipped.

    The doctor id is the sorted Hebrew words of the cleaned name joined
    together, so word order does not matter. A name containing no Hebrew
    letters uses the cleaned name itself as its id, so that such names do not
    all collapse into one record keyed by the empty string.

    The matching record gets ``record[src] = True`` and the row's operation
    (``'operation'``, falling back to ``'sexpertise'``; a missing value counts
    as empty) appended to ``record[src + '_treatments']`` followed by a newline.
    ``keys`` gains the two column names for ``src`` if not yet present.

    Args:
        src: Source label used as a column name, e.g. ``'clalit'``.
        data_: Iterable of dict rows.

    Returns:
        The set of doctor ids seen in this source.
    """
    ids = set()
    t_key = src + '_treatments'
    for data in data_:
        doctor_name = data.get('name', data.get('sname'))
        if not doctor_name:
            # Neither field present (or null/empty): nothing to match on.
            continue
        for noise in _NAME_NOISE:
            doctor_name = doctor_name.replace(noise, '')
        while '  ' in doctor_name:
            doctor_name = doctor_name.replace('  ', ' ')
        doctor_name = doctor_name.strip()
        if not doctor_name:
            continue
        doctor_id = ''.join(sorted(name_re.findall(doctor_name))) or doctor_name
        ids.add(doctor_id)
        doctor_record = doctor_data.setdefault(doctor_id, {'name': doctor_name})
        doctor_record[src] = True
        operation = data.get('operation', data.get('sexpertise')) or ''
        doctor_record[t_key] = doctor_record.get(t_key, '') + operation + '\n'
        for column in (src, t_key):
            if column not in keys:
                keys.append(column)
    return ids

In [61]:
# Fold both sources into the shared doctor_data table and keep the set of
# doctor ids each source contributed. The call order matters: aggregate_doctors
# appends a source's column names to `keys` on first use, so 'clalit' columns
# precede 'harel' columns in the combined CSV.
clalit_ids = aggregate_doctors('clalit', clalit_data)
harel_ids = aggregate_doctors('harel', harel_data)


In [62]:
# Number of distinct doctors across both sources.
print(len(doctor_data))
# Distinct doctors seen in the Clalit data.
print(len(clalit_ids))
# Distinct doctors seen in the Harel data.
print(len(harel_ids))
# Doctors present in both sources.
print(len(clalit_ids & harel_ids))

1109
674
651
216


In [64]:
# Write one row per merged doctor record. Columns follow `keys`; a record that
# lacks a column is written with an empty cell (DictWriter's default restval).
# The with-block guarantees the file is flushed and closed, and newline='' is
# what the csv module requires for files it writes to (the treatment cells
# contain embedded newlines).
with open('combined_mushlam_harel.csv', 'w', newline='') as combined_file:
    w = csv.DictWriter(combined_file, keys)
    w.writeheader()
    w.writerows(doctor_data.values())
