In [8]:
from bs4 import BeautifulSoup
from threading import Thread
import math
import pandas as pd
import queue
import requests

# Global variables and initialization

# Listing URL for page 1; later pages are built by appending "page=<n>" to it.
START_URL = 'https://www.tourradar.com/d/japan?'
# Column labels shared by every DataFrame built from scraped (title, price) rows.
columns = ["Tour Name", "Price (Euro)"]
# Holds the scraped tours; starts out empty with the expected columns.
result = pd.DataFrame(columns=columns)
# Queue handed to each worker thread; workers put one list of rows per page on it.
my_queue = queue.Queue()

In [9]:
def make_urls():
    """Return the URLs of every listing page, in page order.

    The first entry is START_URL itself; each following entry is START_URL
    with "page=<n>" appended, for n from 2 up to the page count reported by
    find_total_pages().
    """
    last_page = find_total_pages()
    # Page 1 carries no "page=" suffix, so it is listed separately
    first_page = [START_URL]
    later_pages = [START_URL + "page=" + str(number)
                   for number in range(2, last_page + 1)]
    return first_page + later_pages

In [10]:
def find_total_pages(timeout=30):
    """Return the number of listing pages reachable from START_URL.

    Fetches START_URL, reads the total tour count from the <h2> inside a
    div with class "stat", counts the div elements with class "bm" on the
    page, and returns the rounded-up quotient of the two.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The page count as an int. Falls back to 1 when no tours can be
        counted on the page, so the division never fails.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout requests may wait on a stalled connection forever
    response = requests.get(START_URL, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find how many tours in total; if several "stat" divs carry a number,
    # the last one wins
    tour_sum = 0
    for div in soup.find_all("div", class_="stat"):
        heading = div.h2
        tokens = heading.text.split() if heading is not None else []
        if not tokens:
            continue
        # Keep digits only so counts written as "1,234" or "1.234" still parse
        digits = "".join(ch for ch in tokens[0] if ch.isdigit())
        if digits:
            tour_sum = int(digits)

    # Find how many tours per page: "bm" divs are counted across the whole
    # page, but only when a "list" div is present
    tour_per_page = 0
    if soup.find_all("div", class_="list"):
        tour_per_page = len(soup.find_all("div", class_="bm"))

    # Nothing to divide by: report a single page instead of raising
    # ZeroDivisionError
    if tour_per_page == 0:
        return 1

    # The answer is the round up value of the division
    return math.ceil(tour_sum / tour_per_page)

In [11]:
def worker(url, queue, timeout=30):
    """Scrape one listing page and put its (title, price) rows on ``queue``.

    Exactly one item is put on ``queue`` per call: a list of
    (title, price) tuples on success, or an empty list if anything fails,
    so a consumer waiting for one item per URL is never left blocked. The
    failure itself is not swallowed; the exception still propagates.

    Args:
        url: Address of the listing page to fetch.
        queue: Queue-like object whose ``put`` receives the list of rows.
        timeout: Seconds to wait for the HTTP response before giving up.
    """
    tours = []
    try:
        # Without a timeout requests may wait on a stalled connection forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        titles = []
        prices = []
        for div_list in soup.find_all("div", class_="list"):
            # Search inside this container only; searching the whole page on
            # every iteration would repeat all rows once per "list" div
            # Find all tour titles
            for div in div_list.find_all("div", class_="bm"):
                titles.append(div.a.text)

            # Find all tour prices
            for span in div_list.find_all("span", class_="prv"):
                # Remove thousands separator in US and European formats
                prices.append(int(span.text.replace(',', '').replace('.', '')))

        tours = list(zip(titles, prices))
    finally:
        # Always report back, even on failure (tours is then still empty)
        queue.put(tours)

In [12]:
# Create all urls for the data retrieval
urls = make_urls()

# Run threads to start the work! One thread is started per page URL.
all_threads = []
for url in urls:
    t = Thread(target=worker, args=(url, my_queue))
    t.start()
    all_threads.append(t)

# Wait for every worker to finish before reading results. my_queue is
# unbounded, so workers never block on put() while we wait here.
for t in all_threads:
    t.join()

# Drain whatever the workers delivered. Reading only after the joins means a
# worker that failed without reporting cannot leave this cell blocked forever.
records = []
while not my_queue.empty():
    records.extend(my_queue.get_nowait())

# Build the DataFrame once from all rows. DataFrame.append no longer exists in
# pandas 2.x, and rebuilding from scratch keeps this cell safe to re-run.
result = pd.DataFrame(records, columns=columns)

# Write the final results to a csv file
result.to_csv('tour_to_japan.csv', index=False)