In [1]:
import requests
from lxml import html
from bs4 import BeautifulSoup
from tqdm import tqdm
import csv
import time
import pandas as pd
import re
import datetime
import ftfy

In [2]:
# Cities chosen because their hostel reviews are expected to be in English.
citylist = [
    'New York',
    'Boston',
    'London',
    'Tokyo',
    'Montreal',
    'Vancouver',
    'San Francisco',
    'Los Angeles',
    'Seattle',
    'Chicago',
    'Philadelphia',
    'Houston',
    'San Diego',
    'Calgary',
    'Berlin',
    'Amsterdam',
    'Stockholm',
    'Birmingham',
    'Glasgow',
    'Edinburgh',
    'Liverpool',
]

# Embed the run timestamp (YYYYMMDD-HHMMSS) in the name of the output CSV.
now = time.strftime("%Y%m%d-%H%M%S")
csvname = f'hostelURL{now}.csv'

In [3]:
# Write the links into a CSV: one row per hostel element found on each city's
# listing page, with the columns name, hostelid, rate, city and url.

with open(csvname, 'w', newline='', encoding="UTF-8") as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'hostelid','rate','city','url'])

    for cityname in citylist:
        url = 'http://www.hostelworld.com/hostels/'+cityname+'?ShowAll=1'
        response = requests.get(url)
        city_soup = BeautifulSoup(response.text, 'html.parser')
        # A list passed to class_ matches elements carrying ANY of the listed
        # classes, i.e. class 'hwta-property' or class 'a'.
        # NOTE(review): 'a' looks like it may have been meant as a tag name - confirm.
        city_hotels_names = city_soup.find_all(class_=['hwta-property','a'])

        for city_hotels_names_list in city_hotels_names:
            cname = city_hotels_names_list.get('data-name')
            cid = city_hotels_names_list.get('data-id')
            # find() returns None when the element has no rating node; calling
            # .text on that would abort the whole scrape and leave a partial CSV,
            # so write an empty rating for such elements instead.
            rating_tag = city_hotels_names_list.find(class_='hwta-rating-score')
            crate = rating_tag.text if rating_tag is not None else ''
            clink = city_hotels_names_list.get('url')
            writer.writerow([cname, cid, crate, cityname, clink])

In [4]:
# Read the hostel list back from disk and display its first five rows.
df = pd.read_csv(filepath_or_buffer=csvname)
df.head(n=5)

Unnamed: 0,name,hostelid,rate,city,url
0,HI NYC Hostel,1850,8.7,New York,https://www.hostelworld.com/hosteldetails.php/...
1,The Local NYC,76281,9.1,New York,https://www.hostelworld.com/hosteldetails.php/...
2,Chelsea International Hostel,801,8.0,New York,https://www.hostelworld.com/hosteldetails.php/...
3,NY Moore Hostel,57176,8.9,New York,https://www.hostelworld.com/hosteldetails.php/...
4,Q4 Hotel,58186,7.8,New York,https://www.hostelworld.com/hosteldetails.php/...


In [13]:
# Prepare a new CSV for Information Storage

# Hostel list to scrape reviews for.
# NOTE(review): this is a hard-coded file name rather than the `csvname` produced
# by the listing-scrape cell - confirm that this file is the intended input.
hostelURL = pd.read_csv('hostelURL20190526-213153.csv', header=0, encoding="utf-8")

# Timestamp 60 days before the moment this cell runs.
then = datetime.datetime.today() - datetime.timedelta(days=60)

# From here on `now` is the run timestamp string and `csvname` names the review CSV
# (both rebind the names used earlier for the hostel-list CSV).
now = time.strftime("%Y%m%d-%H%M%S")
csvname = 'hostelRev'+now+'.csv'

In [14]:
def getHostelInfo(index):
    """Look up one hostel in ``hostelURL`` by index label.

    Returns a 5-tuple ``(name, hostel id, city, rating, link)``. The name is
    passed through ``ftfy.fix_text`` and ``'/reviews/'`` is appended to the
    stored URL to form the link.
    """
    raw_name, ID, City, Rating, base_url = (
        hostelURL[column][index]
        for column in ('name', 'hostelid', 'city', 'rate', 'url')
    )
    return ftfy.fix_text(raw_name), ID, City, Rating, base_url + '/reviews/'

In [None]:
# Scrape the reviews of every hostel in `hostelURL` and write one CSV row per review.
# Requests may fail occasionally due to the repeated traffic from the same IP, so
# every page request is retried a few times with a short sleep in between.

# Maximum number of review pages requested per hostel.
numb_of_pages = 120
# Attempts per page before giving up, pause between attempts, and per-request timeout.
REVIEW_MAX_ATTEMPTS = 3
REVIEW_RETRY_SLEEP = 1
REVIEW_TIMEOUT = 30

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0"}


def _fetch_review_page(hostel_id, page):
    """Return the list under the "reviews" key for one review page.

    Returns an empty list when the response is not JSON, has no non-empty
    "reviews" entry, or when every attempt fails with a requests error.
    """
    url = f"https://www.hostelworld.com/properties/{hostel_id}/reviews?sort=newest&page={page}&monthCount=36"
    for _ in range(REVIEW_MAX_ATTEMPTS):
        try:
            payload = requests.get(url, headers=headers, timeout=REVIEW_TIMEOUT).json()
        except ValueError:
            # The body could not be decoded as JSON; asking again for the same
            # page is not expected to help, so report "no reviews".
            return []
        except requests.exceptions.RequestException:
            # Connection-level failure (SSL error, timeout, reset, ...): wait, then retry.
            time.sleep(REVIEW_RETRY_SLEEP)
            continue
        reviews = payload.get("reviews") if isinstance(payload, dict) else None
        return reviews or []
    return []


def _review_row(review):
    """Flatten one review dict into the CSV columns that follow the hostel columns."""
    reviewer = review['reviewer']
    summary = review['summary']
    return [
        review['id'],
        reviewer['id'],
        summary['stayed'],
        reviewer['username'],
        summary['overall'],
        reviewer['nationality'],
        reviewer['gender'],
        reviewer['ageGroup'],
        reviewer['totalReviews'],
        review['notes'],
        review['ownerReply'],
    ]


with open(csvname, 'w', newline='', encoding="UTF-8") as f:
    writer = csv.writer(f)
    writer.writerow(['Hostel Name','Hostel ID','Hostel City','Overall','Review ID','Reviewer ID','Stayed','Username','Rating','Nationality','Gender','Age Group','Total Reviews','Review','Response'])

    for ind in tqdm(hostelURL.index):
        HNME, HID, HCTY, HRTE, HURL = getHostelInfo(ind)

        # Pages are 1-based; the upper bound is inclusive so that exactly
        # `numb_of_pages` pages are requested at most.
        for nmb in range(1, numb_of_pages + 1):
            reviews = _fetch_review_page(HID, nmb)
            if not reviews:
                # Nothing (more) to read for this hostel: move on to the next one.
                break

            try:
                for val in reviews:
                    writer.writerow([HNME, HID, HCTY, HRTE] + _review_row(val))
            except (KeyError, TypeError):
                # A review without the expected fields: keep the rows already
                # written and stop scraping this hostel.
                break