In [10]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import psycopg2
from bs4 import BeautifulSoup as bs
from splinter import Browser
import re

In [3]:
from webdriver_manager.chrome import ChromeDriverManager
executable_path = {'executable_path': ChromeDriverManager().install()}



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\Alyssa\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [5]:
url = "https://www.tripadvisor.com/Hotels-g34227-Fort_Lauderdale_Broward_County_Florida-Hotels.html"

browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)

html = browser.html
soup = bs(html, "html.parser")

In [6]:
property_title_class = soup.find_all(class_='property_title')
hotel_names = []
for content in property_title_class:
    hotel_name = content.text
    hotel_names.append(hotel_name)

In [7]:
hotel_names

['      B Ocean Resort Fort Lauderdale',
 '      Ocean Manor Beach Resort Hotel',
 '      The Westin Fort Lauderdale',
 '      Riverside Hotel',
 '      Ocean Sky Hotel & Resort',
 '      GALLERYone - A DoubleTree Suites by Hilton Hotel',
 '      Snooze',
 '      Hotel Maren Fort Lauderdale Beach',
 '      Hyatt Centric Las Olas Fort Lauderdale',
 '      Embassy Suites by Hilton Fort Lauderdale 17th Street',
 '      The Westin Fort Lauderdale Beach Resort',
 '      Cambria Hotel Fort Lauderdale Beach',
 '      Hyatt Place Fort Lauderdale Cruise Port',
 '      Courtyard By Marriott Fort Lauderdale Beach',
 '      Best Western Plus Fort Lauderdale Airport/Cruise Port',
 '      Bahia Mar Fort Lauderdale Beach - a DoubleTree by Hilton Hotel',
 '      Renaissance Fort Lauderdale Cruise Port Hotel',
 '      Deco Boutique Hotel',
 '      Sun Tower Hotel & Suites',
 '      The Dalmar, Fort Lauderdale, a Tribute Portfolio Hotel',
 '      Best Western Plus Oceanside Inn',
 '      Ocean Beach Clu

In [12]:
all_website_prices = soup.find_all("div",  {"class": ["priceBlock ui_column is-12-tablet", "price __resizeWatch"]})
    
website_price_results = []
split_list = []
popped_list = []
price_with_website = []
price_vals = []
    
for prices in all_website_prices:
    if len(prices.text) >= 5:
        website_price_results.append(prices.text)
    
for items in website_price_results:
    split_list.append(items.split("$"))
    
for items in split_list:
    items.pop(0)
    popped_list.append(items)
    
for items in popped_list:
    if len(items) == 1:
        price_with_website.append(items[0])
    else:
        price_with_website.append(items[1])
    
for items in price_with_website:
    result = re.sub('[^0-9]','', items)
    int_results = int(result)
    price_vals.append(int_results)
rating_links = soup.find_all("a",  {"class":'ui_bubble_rating'})
hotel_ratings = []
    
for link in rating_links:
    alt_text = link.get('alt')
    split_text = alt_text.split(" ")
    rating = float(split_text[0])
    hotel_ratings.append(rating)
    
#return(hotel_names, price_vals, hotel_ratings)

In [17]:
scrape_df = pd.DataFrame({'hotel_name':hotel_names, 'price':price_vals, 'ratings':hotel_ratings})
scrape_df

Unnamed: 0,hotel_name,price,ratings
0,B Ocean Resort Fort Lauderdale,242,4.5
1,Ocean Manor Beach Resort Hotel,174,4.0
2,The Westin Fort Lauderdale,135,4.0
3,Riverside Hotel,149,4.0
4,Ocean Sky Hotel & Resort,239,3.5
5,GALLERYone - A DoubleTree Suites by Hilt...,144,4.0
6,Snooze,160,4.5
7,Hotel Maren Fort Lauderdale Beach,298,5.0
8,Hyatt Centric Las Olas Fort Lauderdale,203,4.5
9,Embassy Suites by Hilton Fort Lauderdale...,189,4.0


In [19]:
average_price_df = scrape_df.groupby('ratings').aggregate({'price': 'mean'}).reset_index()
average_price_df = average_price_df.rename(columns={'price':'average_price'})
average_price_df

Unnamed: 0,ratings,average_price
0,3.0,89.0
1,3.5,215.0
2,4.0,167.375
3,4.5,209.1
4,5.0,298.0


In [20]:
merged_df = scrape_df.merge(average_price_df)
merged_df

Unnamed: 0,hotel_name,price,ratings,average_price
0,B Ocean Resort Fort Lauderdale,242,4.5,209.1
1,Snooze,160,4.5,209.1
2,Hyatt Centric Las Olas Fort Lauderdale,203,4.5,209.1
3,The Westin Fort Lauderdale Beach Resort,246,4.5,209.1
4,Cambria Hotel Fort Lauderdale Beach,199,4.5,209.1
5,Sun Tower Hotel & Suites,269,4.5,209.1
6,Hotel Motel Lauderdale Inn,109,4.5,209.1
7,Hampton Inn Fort Lauderdale Airport Nort...,139,4.5,209.1
8,Lago Mar Beach Resort & Club,215,4.5,209.1
9,Hilton Fort Lauderdale Beach Resort,309,4.5,209.1


In [21]:
#connect to local database
rds_connection_string = "postgres:postgres@localhost:5432/hotels_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [22]:
#check table names
engine.table_names()

['csv', 'scrape']

In [23]:
#load dataframe into database
merged_df.to_sql(name='scrape', con=engine, if_exists='append', index=False)

In [24]:
#confirm that dataframe has been properly loaded by querying table
pd.read_sql_query('select * from scrape', con=engine).head()

Unnamed: 0,id,hotel_name,price,ratings,average_price
0,1,B Ocean Resort Fort Lauderdale,242,4.5,209.1
1,2,Snooze,160,4.5,209.1
2,3,Hyatt Centric Las Olas Fort Lauderdale,203,4.5,209.1
3,4,The Westin Fort Lauderdale Beach Resort,246,4.5,209.1
4,5,Cambria Hotel Fort Lauderdale Beach,199,4.5,209.1
