![bse_logo_textminingcourse](https://bse.eu/sites/default/files/bse_logo_small.png)

# Text Mining: Models and Algorithms

## Problem Set 1

In [1]:
import json
import pandas as pd
import numpy as np
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from concurrent.futures import ThreadPoolExecutor
import requests
import os
import warnings

# Silence UserWarning and FutureWarning messages raised from pandas
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")


# Go get geckodriver from : https://github.com/mozilla/geckodriver/releases

### Utils

In [2]:
def ffx_preferences(dfolder, download=False):
    '''
    Build the Firefox options object used by the scraper.

    Parameters
    ----------
    dfolder : str
        Folder stored in the ``browser.download.dir`` preference.
    download : bool, default False
        When truthy, the "never ask" MIME list is replaced by a PDF-only list
        and the ``pdfjs.disabled`` preference is set to True so that PDFs are
        downloaded automatically.

    Returns
    -------
    selenium.webdriver.firefox.options.Options
        Options whose ``profile`` carries the preferences above.
    '''
    # MIME types written to "neverAsk.saveToDisk" in the default configuration.
    default_mime_types = "application/msword,application/rtf, application/csv,text/csv,image/png ,image/jpeg, application/pdf, text/html,text/plain,application/octet-stream"

    preferences = [
        ("browser.download.dir", dfolder),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk", default_mime_types),
    ]
    if download:
        # Applied after the defaults, so the PDF-only MIME list wins.
        preferences.append(("browser.helperApps.neverAsk.saveToDisk", "application/pdf,application/x-pdf"))
        preferences.append(("pdfjs.disabled", True))

    firefox_profile = webdriver.FirefoxProfile()
    for preference_name, preference_value in preferences:
        firefox_profile.set_preference(preference_name, preference_value)

    firefox_options = Options()
    firefox_options.profile = firefox_profile
    return firefox_options


def start_up(link, dfolder, geko_path,donwload=True):
    '''
    Launch Firefox through geckodriver and open ``link``.

    Parameters
    ----------
    link : str
        Address loaded once the browser is running.
    dfolder : str
        Download folder; created if it does not exist yet.
    geko_path : str
        Path to the geckodriver executable.
    donwload : bool, default True
        Forwarded to ``ffx_preferences`` as its ``download`` flag
        (the parameter name keeps its original spelling for callers).

    Returns
    -------
    The Firefox webdriver instance, after a fixed 5 second wait.
    '''
    os.makedirs(dfolder, exist_ok=True)

    # Keyword order keeps the original evaluation order: options first, then service.
    firefox = webdriver.Firefox(
        options=ffx_preferences(dfolder, donwload),
        service=Service(geko_path),
    )
    firefox.get(link)
    time.sleep(5)  # fixed pause after loading the page; adjust as needed
    return firefox
        
def check_and_click(browser, xpath, type, timeout=3, poll_interval=1):
    '''
    Look for an element and click it, retrying while it is not in the page.

    Parameters
    ----------
    browser : webdriver
        Object exposing ``find_element(by, value)``.
    xpath : str
        Locator value (an XPath unless ``type`` says otherwise).
    type : str
        Locator kind: "xpath", "id", "css", "class" or "link" (the same
        aliases accepted by ``check_obscures``). Any other value is treated
        as "xpath", which is what every kind used to resolve to.
    timeout : float, default 3
        Seconds after which the search is abandoned.
    poll_interval : float, default 1
        Seconds slept between two attempts.

    Returns
    -------
    "Clicked!" when the element was found and clicked, ``None`` when it was
    not found before ``timeout`` elapsed, ``False`` when any other error
    occurred (the error is printed).
    '''
    # Selenium locator strings; they equal the corresponding `By` constants.
    strategies = {
        "xpath": "xpath",
        "id": "id",
        "css": "css selector",
        "class": "class name",
        "link": "link text",
    }
    by = strategies.get(type, "xpath") if isinstance(type, str) else "xpath"

    start_time = time.time()  # Record the start time
    while True:
        try:
            browser.find_element(by, xpath).click()
            return "Clicked!"  # Element found and clicked successfully
        except NoSuchElementException:
            pass  # Not in the page (yet): wait and retry below
        except Exception as e:
            print(f"An error occurred: {e}")
            return False  # Other unexpected errors

        time.sleep(poll_interval)
        if time.time() - start_time >= timeout:
            return None  # Element not found within `timeout` seconds
        
def check_obscures(browser, xpath, type):
    '''
    Try to click an element to find out whether something is covering it.

    ``type`` selects the locator kind ("xpath", "id", "css", "class" or
    "link"); ``xpath`` is the locator value for that kind. Note that a
    successful check means the element HAS been clicked.

    Returns False (after printing the error) when the click was intercepted
    or the element went stale. Returns True in every other case: the click
    succeeded, the element does not exist, or ``type`` is not a known kind.
    '''
    locator_kinds = (
        ("xpath", 'xpath'),
        ("id", 'id'),
        ("css", 'css selector'),
        ("class", 'class name'),
        ("link", 'link text'),
    )
    locator = None
    for alias, selenium_name in locator_kinds:
        if type == alias:
            locator = selenium_name
            break

    if locator is None:
        # Unknown kind: nothing is looked up or clicked.
        return True

    try:
        browser.find_element(locator, xpath).click()
    except NoSuchElementException:
        # A missing element is deliberately not reported as "obscured".
        return True
    except (ElementClickInterceptedException, StaleElementReferenceException) as blocked:
        print(blocked)
        return False
    return True

def element_exists(browser, path):
    '''
    Tell whether an element matching the XPath ``path`` is present.

    Returns True when ``browser.find_element`` succeeds and False when it
    raises NoSuchElementException; other exceptions propagate.
    '''
    try:
        browser.find_element('xpath', path)
    except NoSuchElementException:
        present = False
    else:
        present = True
    return present

### Scraping Class

In [5]:
class Scrape:
    '''
    Interactive Booking.com scraper driven by a Selenium-controlled Firefox.

    Typical use: ``input_place`` → ``input_dates`` → ``input_people`` →
    ``search`` → ``get_pages`` → ``scrape_info`` → ``get_descriptions``.
    Results accumulate in ``self.data`` (columns Hotels, Ratings, Price, Link
    and, after ``get_descriptions``, Descriptions).
    '''

    # Spanish month names mapped to the month part of the calendar's "data-date" attribute.
    _MONTHS = {'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04', 'mayo': '05', 'junio': '06', 'julio': '07', 'agosto': '08', 'septiembre': '09', 'octubre': '10', 'noviembre': '11', 'diciembre': '12'}
    # Safety bound on "next month" clicks so a mistyped or past date cannot loop forever.
    _MAX_MONTH_CLICKS = 24

    def __init__(self):
        '''Open Booking.com in Firefox, store the page locators and accept the cookie banner.'''
        print("Remember to close the annoying Google popup on the page")
        dfolder='./downloads'
        geko_path='./geckodriver'
        link='https://www.booking.com/index.es.html'
        self.browser =start_up(dfolder=dfolder,link=link,geko_path=geko_path)
        self.search_bar_xpath = '//div[@class="b9b84f4305"]'
        self.search_x_path= '/html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span'
        self.search_x_path2 = '/html/body/div[4]/div/div[2]/div/div[1]/div/form/div[1]/div[4]/button'
        self.date_button_css = 'button.ebbedaf8ac:nth-child(2) > span:nth-child(1)'
        self.number_of_people_xpath = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/button'
        self.search_button_xpath = '/html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span'
        self.x_path_prev_date = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c c9804790f7"]'
        self.x_path_next_date = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c f073249358"]'
        self.x_path_month = '//h3[@class="e1eebb6a1e ee7ec6b631"]'
        x_path_cookies = '//button[@id="onetrust-accept-btn-handler"]'
        self.people_path = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/button'
        self.pages = 1
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        self.data = pd.DataFrame(columns=['Hotels', 'Ratings', 'Price', 'Link'])
        self.place = ''
        check_and_click(self.browser, x_path_cookies, 'xpath')

    def input_place(self):
        '''Ask for the destination, type it into the search box and remember it in ``self.place``.'''
        place = input('Where do you want to go?')
        self.place = (place.lower()).capitalize()
        self.browser.find_element(by='xpath', value=self.search_bar_xpath).click()
        search = self.browser.find_element(by='xpath', value='//*[@id=":re:"]')
        search.clear()
        search.send_keys(place)
        print(f'Place of stay: {self.place}')
        x_path_close = '/html/body/div[4]/div/div[2]/div/div[1]/div/form/div[1]/div[1]/div/div/div[1]/div/div/div[1]/span/svg'
        check_and_click(self.browser, x_path_close, 'xpath')

    @staticmethod
    def _split_date(date_text):
        '''Split "DD mes AAAA" into (zero-padded day, month name, year); raise ValueError if malformed.'''
        parts = date_text.lower().split()
        if len(parts) != 3:
            raise ValueError(f"Expected a date like '02 julio 2024', got {date_text!r}")
        day, month, year = parts
        if not day.isdecimal() or month not in Scrape._MONTHS or not (year.isdecimal() and len(year) == 4):
            raise ValueError(f"Expected a date like '02 julio 2024', got {date_text!r}")
        return day.zfill(2), month, year

    def _show_month(self, month_and_year):
        '''Click "next month" until the calendar header reads ``month_and_year``; raise ValueError if never reached.'''
        for _ in range(self._MAX_MONTH_CLICKS):
            if self.browser.find_element('xpath', self.x_path_month).text == month_and_year:
                return
            self.browser.find_element('xpath', self.x_path_next_date).click()
            time.sleep(1)  # give the calendar time to redraw before reading it again
        if self.browser.find_element('xpath', self.x_path_month).text != month_and_year:
            raise ValueError(f"Month '{month_and_year}' could not be reached in the calendar")

    def _click_day(self, iso_date):
        '''Click the calendar cell whose "data-date" equals ``iso_date``; return whether one was found.'''
        x_path_dates='//div[@id="calendar-searchboxdatepicker"]//table[@class="eb03f3f27f"]//tbody//td[@class="b80d5adb18"]//span[@class="cf06f772fa"]'
        # Cells are looked up on every call so that no reference from a previous calendar view is reused.
        for cell in self.browser.find_elements('xpath', x_path_dates):
            if cell.get_attribute("data-date") == iso_date:
                cell.click()
                return True
        print(f"The date {iso_date} was not found in the calendar.")
        return False

    def input_dates(self):
        '''
        Ask for the start and end dates ("DD mes AAAA", Spanish month names)
        and select them in the date picker.

        Each date is built with its own year, so a stay that crosses a year
        boundary selects the right end date. Raises ValueError when a date is
        malformed or its month cannot be reached in the calendar.
        '''
        print("Just a second...")
        self.browser.find_element('css selector',self.date_button_css).click()
        # Rewind the calendar for as long as a "previous month" button exists.
        while element_exists(self.browser, self.x_path_prev_date):
            self.browser.find_element('xpath', self.x_path_prev_date).click()
            time.sleep(1)
        start_date = (input("Input the start date of your programmed stay in the form (XX mes XXXX). Use Spanish month names. ¡Cuidado con la ortografía!")).lower()
        end_date = (input("Input the end date of your programmed stay in the form (XX mes XXXX). Use Spanish month names. ¡Cuidado con la ortografía!")).lower()
        from_day, month_start, year_start = self._split_date(start_date)
        to_day, month_end, year_end = self._split_date(end_date)

        self._show_month(f"{month_start} {year_start}")
        self._click_day(f"{year_start}-{self._MONTHS[month_start]}-{from_day}")
        # No clicks happen here when the end date lies in the month already displayed.
        self._show_month(f"{month_end} {year_end}")
        self._click_day(f"{year_end}-{self._MONTHS[month_end]}-{to_day}")

        self.browser.find_element('css selector',self.date_button_css).click()
        print(f'Start date of the stay: {start_date}')
        print(f'End date of the stay: {end_date}')

    def input_people(self):
        '''Ask for the number of guests and adjust the counter, which the code assumes starts at 2.'''
        self.browser.find_element('xpath', self.people_path).click()
        number_of_people = int(input('How many people in total are you looking an accomodation for?'))
        minus_xpath = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/div/div/div/div[1]/div[2]/button[1]'
        plus_xpath = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/div/div/div/div[1]/div[2]/button[2]'
        if number_of_people == 1:
            self.browser.find_element('xpath', minus_xpath).click()
        elif number_of_people > 2:
            # One click on "+" per guest beyond the second.
            for _ in range(number_of_people - 2):
                self.browser.find_element('xpath', plus_xpath).click()
                time.sleep(2)
        self.browser.find_element('xpath', self.people_path).click()
        print(f'Accomodations for: {number_of_people} people')

    def search(self):
        '''Click the search button, trying both stored XPaths for it.'''
        check_and_click(self.browser, self.search_x_path, 'xpath')
        check_and_click(self.browser, self.search_x_path2, 'xpath')
        place = self.place
        print(f'Searching accomodations in {(place.lower()).capitalize()}...')

    def get_pages(self, limit=None):
        '''
        Store in ``self.pages`` the number of result pages to scrape.

        The count is the last numeric label among the pagination buttons,
        capped at ``limit`` when one is given. Without any numeric button the
        count is 1.
        '''
        buttons = self.browser.find_elements('xpath', '//button[@class="a83ed08757 a2028338ea"]')
        total_pages = None
        # Walk backwards so a trailing non-numeric button does not break int().
        for button in reversed(buttons):
            label = button.text.strip()
            if label.isdecimal():
                total_pages = int(label)
                break
        if total_pages is None:
            self.pages = 1
        elif limit is not None and total_pages > limit:
            self.pages = limit
        else:
            self.pages = total_pages

    @staticmethod
    def _text_or_nan(container, xpath):
        '''Return the text of the first match of ``xpath`` inside ``container``, or NaN if the lookup fails.'''
        try:
            return container.find_element('xpath', xpath).text
        except Exception:
            return np.nan

    @staticmethod
    def _href_or_nan(container):
        '''Return the href of the first link inside ``container``, or NaN if the lookup fails.'''
        try:
            return container.find_element('xpath', './/a[@href]').get_attribute('href')
        except Exception:
            return np.nan

    def scrape_info(self):
        '''
        Scrape name, rating, price and link of every hotel card on
        ``self.pages`` result pages and append them to ``self.data``.

        After each page except the last one the "next page" button is clicked;
        scraping stops early (with a message) if that button cannot be clicked.
        '''
        print("Scraping general info...\n")
        # Button that moves to the next page of results.
        next_page_css = 'div.b16a89683f:nth-child(3) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1)'
        # Make sure to be on the first page when starting to scrape the data
        first_page_xpath='/html/body/div[4]/div/div[2]/div/div[2]/div[3]/div[2]/div[2]/div[4]/div[2]/nav/nav/div/div[2]/ol/li[1]/button'
        check_and_click(self.browser,first_page_xpath , type='xpath')
        # Rows are collected in a list and appended once, instead of one concat per hotel.
        rows = []
        for i in range(self.pages):
            print(f'Page: {i + 1}')
            # One container per hotel card
            containers = self.browser.find_elements('xpath', '//div[@class="c066246e13"]')
            for hotel in containers:
                hotel_name = hotel.find_element('xpath', './/div[@class="f6431b446c a15b38c233"]').text
                rows.append({
                    'Hotels': hotel_name,
                    'Ratings': self._text_or_nan(hotel, './/div[@class="a3b8729ab1 d86cee9b25"]'),
                    'Price': self._text_or_nan(hotel, './/span[@class="f6431b446c fbfd7c1165 e84eb96b1f"]'),
                    'Link': self._href_or_nan(hotel),
                })
            if i < self.pages - 1:
                try:
                    self.browser.find_element('css selector', next_page_css).click()
                except (NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException) as e:
                    print(f"Could not open page {i + 2}: {e}")
                    break
                time.sleep(2)  # wait for the next page before reading its cards
        if rows:
            self.data = pd.concat([self.data, pd.DataFrame(rows)], ignore_index=True)
        print("\nDone!\n")

    def scrape_description(self,url):
        '''
        Download a hotel page with ``requests`` and return its description text.

        Returns None (after printing the reason) when ``url`` is not a string,
        when the request fails, or when the description paragraph is missing.
        '''
        if not isinstance(url, str):
            # scrape_info stores NaN when a card has no link; nothing to download.
            print(f"Skipping missing link: {url}")
            return None
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error processing {url}: {e}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        description_tag = soup.find('p', class_='a53cbfa6de b3efd73f69')

        if description_tag:
            return description_tag.get_text(strip=True)
        else:
            print(f"Description tag not found on the page: {url}")
            return None

    def get_descriptions(self):
        '''
        Fetch the description of every link in ``self.data`` with a thread
        pool, store them in the 'Descriptions' column, display the frame and
        return it.
        '''
        print("Scraping descriptions...")
        # Set the number of concurrent threads (adjust this based on the processing power of your computer)
        num_threads = 16
        descriptions = []
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # executor.map yields results in the order of the links, so rows stay aligned.
            for i, description in enumerate(executor.map(self.scrape_description, self.data['Link']), start=1):
                descriptions.append(description)
                # Print every 50 links to check the progress of the scraping
                if i % 50 == 0:
                    print(f"Scraped {i} links")
        self.data['Descriptions'] = descriptions
        print(f"Scraped {len(descriptions)} links in total")
        display(self.data)
        print("\nDone!\n")
        return (self.data)


            
        
            

### Pipeline


#### Searching

In [6]:
# Creating the scraper opens Firefox on Booking.com and accepts the cookie banner.
instance1 = Scrape()
# The next three calls prompt for destination, stay dates and number of guests.
instance1.input_place()
instance1.input_dates()
instance1.input_people()
# Submit the search form.
instance1.search()


Remember to close the annoying Google popup on the page
Place of stay: Roma
Just a second...
Start date of the stay: 12 junio 2024
End date of the stay: 02 julio 2024
Accomodations for: 4 people
Searching accomodations in Roma...


#### Scraping general Information and Description text

In [7]:
# Cap the number of result pages to scrape at 2.
instance1.get_pages(limit = 2)
# Fill instance1.data with one row per hotel card (Hotels, Ratings, Price, Link).
instance1.scrape_info()
# Adds the 'Descriptions' column, displays the frame and returns it.
data = instance1.get_descriptions()

Scraping general info...

Page: 1
Page: 2

Done!

Scraping descriptions...
Scraped 26 links in total


Unnamed: 0,Hotels,Ratings,Price,Link,Descriptions
0,Pamphili212,,€ 2.608,https://www.booking.com/hotel/it/pamphili212.e...,"Pamphili212 está en Roma, a 17 min a pie de Es..."
1,Sonder Piazza San Pietro,82.0,€ 8.376,https://www.booking.com/hotel/it/vaticano-pris...,Sonder Piazza San Pietro ofrece alojamiento co...
2,"[Centocelle, Metro C] Luminoso quadrilocale",,€ 3.646,https://www.booking.com/hotel/it/centocelle-me...,"[Centocelle, Metro C] Luminoso quadrilocale se..."
3,hu Roma Camping In Town,78.0,€ 3.200,https://www.booking.com/hotel/it/camping-villa...,"El hu Roma Camping In Town, que se encuentra a..."
4,Palma Residence,75.0,€ 2.848,https://www.booking.com/hotel/it/palma-residen...,El Palma Residence se encuentra a solo 5 minut...
5,Rome Marriott Park Hotel,75.0,€ 4.108,https://www.booking.com/hotel/it/rome-park.es....,El Rome Marriott Park Hotel ofrece habitacione...
6,Borgo Pio 91,83.0,€ 5.009,https://www.booking.com/hotel/it/borgo-pio-91....,El Borgo Pio 91 se encuentra a pocos pasos de ...
7,Z-ArchSuites,83.0,€ 7.360,https://www.booking.com/hotel/it/fz-archstyle-...,Z-ArchSuites está muy bien situado en el barri...
8,CasaBaleno - Luxury apartment in Rome San Giov...,,€ 3.626,https://www.booking.com/hotel/it/casabaleno-lu...,CasaBaleno - Luxury apartment in Rome San Giov...
9,Alis Laura,88.0,€ 4.248,https://www.booking.com/hotel/it/jadore-monic....,"El Alis Laura está situado en Roma, a solo 200..."



Done!



## Step by Step Notebook

### Opening the Browser

In [35]:
# Download folder, geckodriver location and start page handed to start_up().
dfolder='./downloads'
geko_path='./geckodriver'
link='https://www.booking.com/index.es.html'


# start_up() creates the download folder, launches Firefox, loads `link` and waits 5 seconds.
browser=start_up(dfolder=dfolder,link=link,geko_path=geko_path)


### Accepting Cookies

In [36]:
# click on "Accept cookies"
x_path_cookies = '//button[@id="onetrust-accept-btn-handler"]'
check_and_click(browser, x_path_cookies, 'xpath')


'Clicked!'

### Search Bar

In [37]:
# Click the search-bar container so that the destination input can be typed into.
browser.find_element(by='xpath',value='//div[@class="b9b84f4305"]').click()

### Input the place

In [38]:
# Ask for the destination and type it into the input with id ":re:".
# NOTE(review): unlike Scrape.input_place, the field is not cleared first — confirm it starts empty.
place = input('Where do you want to go?')
search1 = browser.find_element(by='xpath',value='//*[@id=":re:"]')
search1.send_keys(place)

### Input the Dates

In [39]:
# CSS selector of the date button; clicking it here opens the calendar and the
# same selector is clicked again after the dates have been picked.
css_date='button.ebbedaf8ac:nth-child(2) > span:nth-child(1)'

browser.find_element('css selector',css_date).click()


In [40]:
# Locators of the calendar controls: previous/next month buttons and the month header.
x_path_prev_date = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c c9804790f7"]'
x_path_next_date = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 deab83296e f4552b6561 dc72a8413c f073249358"]'
x_path_month1 = '//h3[@class="e1eebb6a1e ee7ec6b631"]'

# Rewind the calendar for as long as a "previous month" button exists.
while element_exists(browser, x_path_prev_date):
    browser.find_element('xpath', x_path_prev_date).click()
    time.sleep(1)

# Ask for both dates; everything after the two-character day is "month year".
start_date = input("Input the start date of your programmed stay in the form (XX mes XXXX). Use Spanish month names. ¡Cuidado con la ortografía!").lower()
end_date = input("Input the end date of your programmed stay in the form (XX mes XXXX). Use Spanish month names. ¡Cuidado con la ortografía!").lower()
month_and_year_start, month_and_year_end = start_date[3:], end_date[3:]

# Move forward month by month until the header shows the month of the start date.
month_and_year = browser.find_element('xpath', x_path_month1).text
while month_and_year != month_and_year_start:
    browser.find_element('xpath', x_path_next_date).click()
    month_and_year = browser.find_element('xpath', x_path_month1).text
    time.sleep(1)
print(month_and_year)




junio 2024


### Select the dates

In [42]:
# Spanish month names mapped to the month part of the calendar's "data-date" attribute.
months_dict = {'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04', 'mayo': '05', 'junio': '06', 'julio': '07', 'agosto': '08', 'septiembre': '09', 'octubre': '10', 'noviembre': '11', 'diciembre': '12'}
x_path_dates='//div[@id="calendar-searchboxdatepicker"]//table[@class="eb03f3f27f"]//tbody//td[@class="b80d5adb18"]//span[@class="cf06f772fa"]'
from_day = start_date[:2]
to_day = end_date[:2]
month_start = month_and_year_start[:-5]
month_end = month_and_year_end[:-5]
# `year` is the year shown by the calendar (the start month); the end date
# gets its own year so that a stay crossing New Year selects the right day.
year = month_and_year[-4:]
year_end = month_and_year_end[-4:]
start_iso = f"{year}-{months_dict[month_start]}-{from_day}"
end_iso = f"{year_end}-{months_dict[month_end]}-{to_day}"

# Click the start day in the month currently displayed.
dates = browser.find_elements('xpath',x_path_dates)
for date in dates:
    if date.get_attribute("data-date") == start_iso:
        date.click()
        break

# Compare month AND year: "junio 2024" and "junio 2025" are different months.
if month_and_year_start != month_and_year_end:
    while browser.find_element('xpath', x_path_month1).text != month_and_year_end:
        browser.find_element('xpath', x_path_next_date).click()
        time.sleep(1)  # let the calendar redraw before reading the header again

# Look the day cells up again so no reference from before the first click is reused.
dates = browser.find_elements('xpath',x_path_dates)
for date in dates:
    if date.get_attribute("data-date") == end_iso:
        date.click()
        break

# Clicking the date button again closes the calendar.
browser.find_element('css selector',css_date).click()


### Input the number of people

In [45]:
# Absolute XPath of the occupancy button; clicking it opens the guests panel
# whose +/- buttons are used in the next cell.
x_path = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/button'

browser.find_element('xpath', x_path).click()

In [46]:
number_of_people = int(input('How many people in total are you looking an accomodation for?'))

# Despite the "css_" prefix, both locators are absolute XPaths of the -/+ buttons.
css_minus = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/div/div/div/div[1]/div[2]/button[1]'
css_plus = '/html/body/div[3]/div[2]/div/form/div[1]/div[3]/div/div/div/div/div[1]/div[2]/button[2]'

# The clicks below assume the counter starts at 2: one "-" for a single
# guest, one "+" per guest beyond the second, nothing for exactly two.
if number_of_people == 1:
    browser.find_element('xpath', css_minus).click()
else:
    for _ in range(number_of_people - 2):
        browser.find_element('xpath', css_plus).click()
        time.sleep(2)
    

### Search

In [47]:
# Absolute XPath of the search button.
search_xpath='/html/body/div[3]/div[2]/div/form/div[1]/div[4]/button/span'

# check_obscures already clicks the button when nothing covers it; check_and_click
# then looks for the same XPath again and clicks it if it is still found.
# NOTE(review): this may click the search button twice — confirm that is intended.
check_obscures(browser,search_xpath , type='xpath')
check_and_click(browser,search_xpath , type='xpath')


### Extracting Number of Pages

In [48]:

def get_number_pages(browser):
    '''
    Return the number of result pages.

    The value is the text of the last pagination button converted to int,
    or 1 when the page shows no pagination buttons.
    '''
    page_buttons = browser.find_elements(
        'xpath', '//button[@class="a83ed08757 a2028338ea"]')
    if not page_buttons:
        return 1
    last_label = page_buttons[-1].text
    return int(last_label)

# Number of result pages; the scraping cell below loops `pages` times.
pages = get_number_pages(browser)

print(pages)


33


### Scraping Pipeline

In [49]:
# pages = 5
# Locators of the "next page" button (absolute XPath kept for reference, CSS selector used below).
change_page_xpath = '/html/body/div[4]/div/div[2]/div/div[2]/div[3]/div[2]/div[2]/div[4]/div[2]/nav/nav/div/div[3]/button/span/span'
css = 'div.b16a89683f:nth-child(3) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1)'
# Make sure to be on the first page when starting to scrape the data
first_page_xpath='/html/body/div[4]/div/div[2]/div/div[2]/div[3]/div[2]/div[2]/div[4]/div[2]/nav/nav/div/div[2]/ol/li[1]/button'
check_and_click(browser,first_page_xpath , type='xpath')

# Rows are collected in a list and turned into a DataFrame once at the end
# (one concat per hotel copies the whole frame every time).
rows = []
for i in range(pages):
    print(f'Page: {i + 1}')
    # One container per hotel card
    containers = browser.find_elements('xpath', '//div[@class="c066246e13"]')
    for hotel in containers:
        hotel_name = hotel.find_element('xpath', './/div[@class="f6431b446c a15b38c233"]').text
        # Rating, price and link are optional: a failed lookup is stored as NaN.
        try:
            hotel_rating = hotel.find_element('xpath', './/div[@class="a3b8729ab1 d86cee9b25"]').text
        except Exception:
            hotel_rating = np.nan
        try:
            hotel_price = hotel.find_element('xpath', './/span[@class="f6431b446c fbfd7c1165 e84eb96b1f"]').text
        except Exception:
            hotel_price = np.nan
        try:
            hotel_url = hotel.find_element('xpath', './/a[@href]').get_attribute('href')
        except Exception:
            hotel_url = np.nan
        rows.append({'Hotels': hotel_name, 'Ratings': hotel_rating, 'Price':hotel_price, 'Link': hotel_url})
    # Move to the next page. The button used to be located but never clicked,
    # so every iteration scraped the same page again.
    if i < pages - 1:
        try:
            browser.find_element('css selector', css).click()
        except (NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException) as e:
            print(f"Could not open page {i + 2}: {e}")
            break
        time.sleep(2)  # wait for the next page before reading its cards

data = pd.DataFrame(rows, columns=['Hotels', 'Ratings', 'Price', 'Link'])
print("\nDone!\n")
display(data)

Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33

Done!



Unnamed: 0,Hotels,Ratings,Price,Link
0,Hostal Rivera - Atocha,79,€ 150,https://www.booking.com/hotel/es/hostal-rivera...
1,flor hostel capsules,65,€ 86,https://www.booking.com/hotel/es/flor-hostel-c...
2,Hostal PETITE MAMAN,78,€ 244,https://www.booking.com/hotel/es/hostal-atelie...
3,New Dream Madrid,80,€ 263,https://www.booking.com/hotel/es/new-dream-mad...
4,Casa de Huespedes Dolce Vita,77,€ 182,https://www.booking.com/hotel/es/hostal-dolce-...
...,...,...,...,...
820,Room Plaza España,87,€ 260,https://www.booking.com/hotel/es/room-plaza-es...
821,Hostal La Casa de La Plaza,72,€ 247,https://www.booking.com/hotel/es/hostal-la-cas...
822,Hostal Regio,80,€ 246,https://www.booking.com/hotel/es/hostal-regio....
823,Far Home Atocha,85,€ 240,https://www.booking.com/hotel/es/no-name-city-...


### Scraping Descriptions using BeautifulSoup with parallelized operations

In [50]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import time

# Browser-like User-Agent sent with every description request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

# Function to scrape the descriptions using Beautiful Soup
def scrape_description(url):
    '''
    Download a hotel page and return the text of its description paragraph.

    Parameters
    ----------
    url : str
        Address of the hotel page. The scraping cell stores NaN when a card
        has no link, so any non-string value is skipped without a request.

    Returns
    -------
    str or None
        The description text, or None (after printing the reason) when the
        link is missing, the request fails or the paragraph is not found.
    '''
    if not isinstance(url, str):
        print(f"Skipping missing link: {url}")
        return None

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error processing {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    description_tag = soup.find('p', class_='a53cbfa6de b3efd73f69')

    if description_tag:
        return description_tag.get_text(strip=True)
    else:
        print(f"Description tag not found on the page: {url}")
        return None

# Set the number of concurrent threads (adjust this based on the processing power of your computer)
# Number of worker threads (adjust this based on the processing power of your computer)
num_threads = 16

# executor.map yields the results in the order of data['Link'], so the list
# built here lines up with the rows of the DataFrame.
descriptions = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    for description in executor.map(scrape_description, data['Link']):
        descriptions.append(description)
        # Progress message every 50 links
        if len(descriptions) % 50 == 0:
            print(f"Scraped {len(descriptions)} links")

# Store the descriptions next to the other hotel fields
data['Descriptions'] = descriptions

# Final count once every link has been processed
print(f"Scraped {len(descriptions)} links")
print("\nDone!\n")


Scraped 50 links
Scraped 100 links
Scraped 150 links
Scraped 200 links
Scraped 250 links
Scraped 300 links
Scraped 350 links
Scraped 400 links
Scraped 450 links
Scraped 500 links
Scraped 550 links
Scraped 600 links
Scraped 650 links
Scraped 700 links
Scraped 750 links
Scraped 800 links
Scraped 825 links


In [51]:
# Show the scraped table, which now includes the 'Descriptions' column added above.
data

Unnamed: 0,Hotels,Ratings,Price,Link,Descriptions
0,Hostal Rivera - Atocha,79,€ 150,https://www.booking.com/hotel/es/hostal-rivera...,El Hostal Rivera está ubicado en una zona cént...
1,flor hostel capsules,65,€ 86,https://www.booking.com/hotel/es/flor-hostel-c...,Flor hostel capsules está muy bien situado en ...
2,Hostal PETITE MAMAN,78,€ 244,https://www.booking.com/hotel/es/hostal-atelie...,El Hostal PETITE MAMAN está en el centro de Ma...
3,New Dream Madrid,80,€ 263,https://www.booking.com/hotel/es/new-dream-mad...,New Dream Madrid se encuentra en el centro de ...
4,Casa de Huespedes Dolce Vita,77,€ 182,https://www.booking.com/hotel/es/hostal-dolce-...,El establecimiento Casa de Huespedes Dolce Vit...
...,...,...,...,...,...
820,Room Plaza España,87,€ 260,https://www.booking.com/hotel/es/room-plaza-es...,"Room Plaza España está en el centro de Madrid,..."
821,Hostal La Casa de La Plaza,72,€ 247,https://www.booking.com/hotel/es/hostal-la-cas...,Este establecimiento ofrece una buena relación...
822,Hostal Regio,80,€ 246,https://www.booking.com/hotel/es/hostal-regio....,El Hostal Regio se encuentra en una calle tran...
823,Far Home Atocha,85,€ 240,https://www.booking.com/hotel/es/no-name-city-...,El Far Home Atocha está situado en el centro d...


### Order by Price

In [28]:
# # Clean and convert the 'Price' column to numeric values
# data['Price'] = pd.to_numeric(data['Price'].str.replace('€', '').str.replace('.', ''), errors='coerce')

# # Sort the DataFrame by the 'Price' column in ascending order
# data_sorted = data.sort_values(by='Price', ascending=False)
# display(data_sorted)

### Create CSV Files

In [21]:
# csv_name = 'Madrid.csv'
# csv = data.to_csv(f'.\{csv_name}', sep=',')
# # csv = data.to_csv('Madrid', sep=',')