In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Empty results table; each scraped doctor profile becomes one row with these fields.
data = pd.DataFrame(
    columns=[
        'doc_name',
        'qualifications',
        'specialities',
        'verification_label',
        'practo_score',
        'practo_votes',
        'description',
        'clinics',
    ]
)

In [3]:
def clean_text(tag_text):
    """Normalise whitespace in text pulled out of an HTML tag.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, and a space directly before a comma is removed. Leading and
    trailing whitespace is collapsed to one space, not stripped.

    Args:
        tag_text: The text to clean, either a text string or a byte string
            (for example the result of ``tag.text.encode('utf-8')``).

    Returns:
        The cleaned text, of the same type as ``tag_text``.
    """
    if isinstance(tag_text, bytes):
        # On Python 3 a str pattern cannot be applied to bytes (TypeError),
        # so byte input is handled with byte patterns.
        collapsed = re.sub(br'\s+', b' ', tag_text)
        return re.sub(b' ,', b',', collapsed)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r'\s+', ' ', tag_text)
    return re.sub(' ,', ',', collapsed)

In [4]:
def _parse_score_and_votes(score_text, votes_text):
    """Extract the integer ``(score, votes)`` pair from the two text snippets.

    The digits of ``score_text`` may carry the vote count next to the score
    (the earlier implementation removed the vote digits from the score
    digits, which suggests the score element's text can include the votes
    text -- TODO confirm against the page markup). Only one copy of the vote
    count, at the end or at the start of the digit string, is removed, so a
    score that happens to contain the same digits as the vote count is no
    longer corrupted.

    Raises:
        ValueError: If either snippet yields no digits to convert.
    """
    votes = int(re.sub(r'\D', '', votes_text))
    score_digits = re.sub(r'\D', '', score_text)
    vote_digits = str(votes)
    if score_digits.endswith(vote_digits):
        score_digits = score_digits[:-len(vote_digits)]
    elif score_digits.startswith(vote_digits):
        score_digits = score_digits[len(vote_digits):]
    return int(score_digits), votes


def _parse_clinic(clinic):
    """Build the detail dictionary for a single ``clinic-block`` element."""
    address_heading = clinic.find('div', class_='clinic-address').h2
    locality_block = clinic.find('div', class_='clinic-locality')

    # Coordinates are optional: fall back to 'NA' for both values when the
    # map container, one of its meta tags, or a numeric value is missing.
    try:
        coordinate_metas = clinic.find('p', class_='map-link-container').find_all('meta')
        latitude = float(coordinate_metas[0]['content'])
        longitude = float(coordinate_metas[1]['content'])
    except (AttributeError, IndexError, KeyError, ValueError):
        latitude = longitude = 'NA'

    timings = clean_text(clinic.find('div', class_='clinic-timings').text.encode('utf-8'))
    # Blank check that works for both byte and text strings; empty text is
    # treated the same as whitespace-only text.
    if not timings.strip():
        timings = 'Not Mentioned'

    return {
        'clinic_name': address_heading.text.encode('utf-8'),
        'locality': locality_block.h2.text.encode('utf-8'),
        'city': locality_block.span.text.encode('utf-8'),
        'clinic_address': clinic.find('p', class_='clinic-street-address').text.encode('utf-8').strip(),
        'clinic_url': address_heading.a['href'],
        'clinic_geo_coordinates': {'latitude': latitude, 'longitude': longitude},
        'clinic_timings': timings,
    }


def get_doctor_data(doc_url, timeout=30):
    """Download one doctor profile page and extract its details.

    Args:
        doc_url: URL of the doctor's profile page.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A list ``[name, qualifications, specialities, verification_label,
        score, votes, description, clinics]``. Text taken from the page is
        UTF-8 encoded. ``verification_label`` is ``'Not Verified'`` when the
        page has no verification label; ``score``/``votes`` are
        ``'Not Scored'``/``0`` when the score or votes elements are missing
        or contain no digits. ``clinics`` maps ``'0'``, ``'1'``, ... to one
        dictionary per clinic block found on the page.

    Raises:
        AttributeError, TypeError: If a required element (name,
            qualifications, specialities, description, or a required clinic
            field) is not present in the page.
        requests.exceptions.RequestException: If the page cannot be fetched.
    """
    response = requests.get(doc_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')

    # Name, qualifications and specialities are required fields.
    name = soup.find('h1', class_='doctor-name').text.encode('utf-8')
    qualifications = clean_text(
        soup.find('p', class_='doctor-qualifications').text.encode('utf-8').strip())
    specialities = clean_text(
        soup.find('h2', class_='doctor-specialties').text.encode('utf-8').strip())

    # Verification label is optional.
    label_tag = soup.find('span', class_='verification-label')
    if label_tag is None:
        verification_label = 'Not Verified'
    else:
        verification_label = label_tag.text.encode('utf-8').strip()

    # Score and vote count are optional and only reported together.
    score_tag = soup.find('div', class_='patient_experience_score')
    votes_tag = soup.find('span', class_='doctor-votes')
    try:
        score, votes = _parse_score_and_votes(score_tag.text, votes_tag.text)
    except (AttributeError, ValueError):
        # AttributeError: one of the tags is absent (find returned None).
        # ValueError: a tag is present but holds no digits.
        score, votes = 'Not Scored', 0

    # Description comes from the page's itemprop="description" meta tag.
    description = soup.find('meta', attrs={'itemprop': 'description'})['content'].encode('utf-8')

    # One entry per clinic block, keyed by its position on the page.
    clinic_blocks = soup.find_all('div', class_='clinic-block', recursive=True)
    clinics = {str(i): _parse_clinic(block) for i, block in enumerate(clinic_blocks)}

    return [name, qualifications, specialities, verification_label, score, votes, description, clinics]

In [5]:
list_filename = 'ophthalmologist.txt'
with open(list_filename) as f:
    # Skip blank lines (such as a trailing empty line) so that no request is
    # attempted with an empty URL.
    doctor_urls = [line.strip() for line in f if line.strip()]

# Scrape every profile first and build the table in a single step; adding
# rows to a DataFrame one at a time re-allocates and re-sorts it on every
# iteration.
doctor_rows = [get_doctor_data(url) for url in doctor_urls]

# The row-by-row version inserted each new record at the top, so the last URL
# in the file ended up at index 0; the reversed list keeps that ordering.
# Rebuilding from the scraped rows also makes the cell safe to re-run without
# duplicating records.
data = pd.DataFrame(doctor_rows[::-1], columns=data.columns)

In [6]:
# Show the scraped table as this cell's output.
data

Unnamed: 0,doc_name,qualifications,specialities,verification_label,practo_score,practo_votes,description,clinics
0,Dr. Nirav Agarwat,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Nirav Agarwat is an Ophthalmologist/ Eye S...,"{u'0': {u'city': u'Mumbai', u'locality': u'Mir..."
1,Dr. Mukul Sharma,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Mukul Sharma is an Ophthalmologist/ Eye Su...,"{u'1': {u'city': u'Mumbai', u'locality': u'Byc..."
2,Dr. Mukul Sharma,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Mukul Sharma is an Ophthalmologist/ Eye Su...,"{u'1': {u'city': u'Mumbai', u'locality': u'Byc..."
3,Dr Nirav Davda,MS - Ophthalmology,Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr Nirav Davda is an Ophthalmologist/ Eye Surg...,"{u'0': {u'city': u'Mumbai', u'locality': u'Mal..."
4,Dr. Somchand Kannji Salva,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Somchand Kannji Salva is an Ophthalmologis...,"{u'1': {u'city': u'Mumbai', u'locality': u'Dad..."
5,Dr. Vimal Fudnawala,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Vimal Fudnawala is an Ophthalmologist/ Eye...,"{u'0': {u'city': u'Mumbai', u'locality': u'Wad..."
6,Dr. Bhagat Vivian Jagdishchandra,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Bhagat Vivian Jagdishchandra is an Ophthal...,"{u'0': {u'city': u'Mumbai', u'locality': u'Bor..."
7,Dr Chinchalkar B C,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr Chinchalkar B C is an Ophthalmologist/ Eye ...,"{u'0': {u'city': u'Mumbai', u'locality': u'Mal..."
8,Dr. Somchand Kannji Salva,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Somchand Kannji Salva is an Ophthalmologis...,"{u'1': {u'city': u'Mumbai', u'locality': u'Dad..."
9,Dr. Priya Rajput,"MBBS, MS - Ophthalmology",Ophthalmologist/ Eye Surgeon,Not Verified,Not Scored,0.0,Dr. Priya Rajput is an Ophthalmologist/ Eye Su...,"{u'1': {u'city': u'Mumbai', u'locality': u'Mal..."


In [7]:
# Save the scraped table to a CSV file in the working directory. The row index
# is written as well (pandas' default), so it appears as an extra unnamed
# first column when the file is read back.
data.to_csv('ophthalmologist_info.csv')