## This code is for Data Scraping of 'Tourist Destinations Worldwide' using multiple sources like Wikipedia, Google Search,Crowd sourcing and TripAdvisor website.

### Group 14 : Radha Kamalapurkar - 12110009, Abhishek Maji - 12110101, Vini Chhajed - 12110080, Kumar Lalwani - 12110002

#### Initialize Libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import re
import numpy as np
import time
import re

#### Scrape data(Destination_Name, City and Visitors_per_Year) from Wikipedia table using BeautifulSoup

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_most_visited_palaces_and_monuments'
table_id = "wikitable sortable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
lst_most_visited_dest = soup.find('table', attrs={"class":table_id})
lst = pd.read_html(str(lst_most_visited_dest))

In [3]:
df=pd.DataFrame(lst[0])

In [4]:
df = df.xs('National monuments by visitors per year', axis=1, drop_level=True)

In [5]:
df1=df[['Name', 'Country flag, city', 'Visitors per year']]

#### Create required dataframe from the scraped Wikipedia table

In [6]:
df1.rename(columns = {'Name':'Destination_Name','Country flag, city':'City', 'Visitors per year':'Visitors_per_year'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [7]:
df1.reset_index(drop=True)

Unnamed: 0,Destination_Name,City,Visitors_per_year
0,Forbidden City,Beijing,"17,000,000+"
1,St. Peter's Basilica,Vatican City,11000000
2,Palace of Versailles,Versailles,8100000
3,Lincoln Memorial,"Washington, D.C.",7804683
4,Colosseum-Forum-Palatine Hill Circuit,Rome,7650519
5,Parthenon,Athens,7200000
6,Taj Mahal,Agra,7090207
7,Eiffel Tower,Paris,7000000
8,Cologne Cathedral,Cologne,6000000
9,Peterhof Palace,Saint Petersburg,5245900


#### Select random 14 Tourist Destinations from the DataFrame created above. We will be collecting data for these selected Tourist Destinations

In [8]:
filter_val=['Tower of London','Royal Palace of Madrid','Edinburgh Castle','Topkapı Palace','Forbidden City','Kazan Kremlin','Parthenon','Eiffel Tower','Cologne Cathedral','Sagrada Família','Machu Picchu','Alhambra','Moscow Kremlin','Chapultepec Castle']
df2=df1[df1.Destination_Name.isin(filter_val)]

#### Assign Destination_Id to each of the selected Tourist Destinations

In [9]:
data = {'Destination_Id': ['g186338-d187547','g187514-d190146','g186525-d187653','g293974-d294547','g294212-d319086','g298520-d321110','g189400-d198711','g187147-d188151','g187371-d192291','g187497-d190166','g294318-d668949','g187441-d191078','g298484-d300392','g150800-d183986'], 
        'Destination_Name': ['Tower of London','Royal Palace of Madrid','Edinburgh Castle','Topkapı Palace','Forbidden City','Kazan Kremlin','Parthenon','Eiffel Tower','Cologne Cathedral','Sagrada Família','Machu Picchu','Alhambra','Moscow Kremlin','Chapultepec Castle']
        }

destination_id_df = pd.DataFrame(data, columns= ['Destination_Id', 'Destination_Name'])

In [10]:
destination_id_df

Unnamed: 0,Destination_Id,Destination_Name
0,g186338-d187547,Tower of London
1,g187514-d190146,Royal Palace of Madrid
2,g186525-d187653,Edinburgh Castle
3,g293974-d294547,Topkapı Palace
4,g294212-d319086,Forbidden City
5,g298520-d321110,Kazan Kremlin
6,g189400-d198711,Parthenon
7,g187147-d188151,Eiffel Tower
8,g187371-d192291,Cologne Cathedral
9,g187497-d190166,Sagrada Família


#### Create another Data Frame with City and City_Id for each of the Tourist Destinations

In [11]:
data = {'City_Id': ['g186338','g187514','g186525','g293974','g294212','g298520','g189400','g187147','g187371','g187497','g294318','g187441','g298484','g150800'], 
        'City': ['London','Madrid','Edinburgh','Istanbul','Beijing','Kazan','Athens','Paris','Cologne','Barcelona','Cusco','Granada','Moscow','Mexico City']
        }

city_id = pd.DataFrame(data, columns= ['City_Id', 'City'])

#### Scrape data from Google Search to extract the temperature of each of the selected cities

In [12]:
city = ['London','Madrid','Edinburgh','Istanbul','Beijing','Kazan','Athens','Paris','Cologne','Barcelona','Cusco','Granada','Moscow','Mexico City']


avg_temp=[]
city_df=[]
  
for i in city:
    url = "https://google.com/search?q=weather+in+" + i
    request_result = requests.get( url )
    soup5 = BeautifulSoup( request_result.text , "html.parser" )
    temperature = soup5.find( "div" , class_='BNeawe' ).text 
    avg_temp.append(temperature)
    city_df.append(i)


#### Create a Data Frame with City and its Average Temperature

In [13]:
import pandas as pd
df9 = {'City':city_df,'City_Average_Temperature':avg_temp}
avg_temp = pd.DataFrame.from_dict(df9)

In [14]:
city_id

Unnamed: 0,City_Id,City
0,g186338,London
1,g187514,Madrid
2,g186525,Edinburgh
3,g293974,Istanbul
4,g294212,Beijing
5,g298520,Kazan
6,g189400,Athens
7,g187147,Paris
8,g187371,Cologne
9,g187497,Barcelona


In [15]:
city_id_df=pd.merge(city_id, avg_temp, on=["City"])

In [16]:
city_id_df

Unnamed: 0,City_Id,City,City_Average_Temperature
0,g186338,London,13°C
1,g187514,Madrid,18°C
2,g186525,Edinburgh,11°C
3,g293974,Istanbul,19°C
4,g294212,Beijing,9°C
5,g298520,Kazan,12°C
6,g189400,Athens,23°C
7,g187147,Paris,14°C
8,g187371,Cologne,12°C
9,g187497,Barcelona,20°C


In [17]:
df3=pd.merge(df2, destination_id_df, on=["Destination_Name"])

#### Merge Data Frames to collate Destination and City level data

In [18]:
destination_df=pd.merge(df3, city_id_df, on=["City"])

In [19]:
destination_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C
1,Parthenon,Athens,7200000,g189400-d198711,g189400,23°C
2,Eiffel Tower,Paris,7000000,g187147-d188151,g187147,14°C
3,Cologne Cathedral,Cologne,6000000,g187371-d192291,g187371,12°C
4,Sagrada Família,Barcelona,4500000,g187497-d190166,g187497,20°C
5,Kazan Kremlin,Kazan,2893300,g298520-d321110,g298520,12°C
6,Tower of London,London,2858336,g186338-d187547,g186338,13°C
7,Alhambra,Granada,2760000,g187441-d191078,g187441,20°C
8,Topkapı Palace,Istanbul,2720119,g293974-d294547,g293974,19°C
9,Chapultepec Castle,Mexico City,2661615,g150800-d183986,g150800,15°C


#### Crowdsourcing - A Google Form is created to carry out a survey regarding the popularity of the Torist Destinations selected.This survey is used to find out which tourist destination is most preferred across the following criterias - Preferred Destination for Family Holiday,Preferred Destination for Solo Trip,Preferred Destination for a Holiday with Friends and Preferred Destination for a Holiday with Spouse/Partner

In [20]:
crowd_src=pd.read_csv('C:/Users/sanyo/OneDrive - Indian School of Business/Term1/DC/Office_Hour/get_phones_data/get_phones_data/Crowd_Source.csv',sep=',')

In [21]:
crowd_src

Unnamed: 0,Destination_Name,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination
0,Alhambra,4%,6%,0%,2%
1,Chapultepec Castle,2%,4%,6%,0%
2,Cologne Cathedral,4%,2%,0%,4%
3,Edinburgh Castle,13%,8%,8%,6%
4,Eiffel Tower,12%,8%,4%,47%
5,Forbidden City,0%,4%,4%,4%
6,Kazan Kremlin,0%,2%,4%,0%
7,Machu Picchu,4%,21%,13%,0%
8,Moscow Kremlin,4%,4%,0%,4%
9,None of These,8%,17%,4%,8%


#### Merge the Crowd sourced data with the existing data frame for Destinations

In [22]:
destination_popularity_df=pd.merge(destination_df, crowd_src, on=["Destination_Name"])

In [23]:
destination_popularity_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%
1,Parthenon,Athens,7200000,g189400-d198711,g189400,23°C,8%,6%,6%,14%
2,Eiffel Tower,Paris,7000000,g187147-d188151,g187147,14°C,12%,8%,4%,47%
3,Cologne Cathedral,Cologne,6000000,g187371-d192291,g187371,12°C,4%,2%,0%,4%
4,Sagrada Família,Barcelona,4500000,g187497-d190166,g187497,20°C,2%,4%,19%,2%
5,Kazan Kremlin,Kazan,2893300,g298520-d321110,g298520,12°C,0%,2%,4%,0%
6,Tower of London,London,2858336,g186338-d187547,g186338,13°C,27%,12%,10%,4%
7,Alhambra,Granada,2760000,g187441-d191078,g187441,20°C,4%,6%,0%,2%
8,Topkapı Palace,Istanbul,2720119,g293974-d294547,g293974,19°C,8%,0%,4%,4%
9,Chapultepec Castle,Mexico City,2661615,g150800-d183986,g150800,15°C,2%,4%,6%,0%


#### Scrape the following Tourist Destination data from TripAdvisor website for the selected destinations:Destination Description, Destination Type, Destination Rating, Destination Number of Reviews

In [24]:
driver = webdriver.Chrome(executable_path='C:/Users/sanyo/chromedriver.exe')
seed_url = "https://www.tripadvisor.in"
driver.get(seed_url)
time.sleep(10)
soup1 = BeautifulSoup(driver.page_source, 'html5lib')

k=[]
attraction_key_lst=[]
attraction_key_lst=["https://www.tripadvisor.in/Attraction_Review-g186338-d187547-Reviews-Tower_of_London-London_England.html","https://www.tripadvisor.in/Attraction_Review-g187514-d190146-Reviews-Royal_Palace_of_Madrid-Madrid.html","https://www.tripadvisor.in/Attraction_Review-g186525-d187653-Reviews-Edinburgh_Castle-Edinburgh_Scotland.html","https://www.tripadvisor.in/Attraction_Review-g293974-d294547-Reviews-Topkapi_Palace-Istanbul.html","https://www.tripadvisor.in/Attraction_Review-g294212-d319086-Reviews-Forbidden_City_The_Palace_Museum-Beijing.html","https://www.tripadvisor.in/Attraction_Review-g298520-d321110-Reviews-Kazan_Kremlin-Kazan_Republic_of_Tatarstan_Volga_District.html","https://www.tripadvisor.in/Attraction_Review-g189400-d198711-Reviews-Parthenon-Athens_Attica.html","https://www.tripadvisor.in/Attraction_Review-g187147-d188151-Reviews-Eiffel_Tower-Paris_Ile_de_France.html","https://www.tripadvisor.in/Attraction_Review-g187371-d192291-Reviews-Cologne_Cathedral-Cologne_North_Rhine_Westphalia.html","https://www.tripadvisor.in/Attraction_Review-g187497-d190166-Reviews-Basilica_of_the_Sagrada_Familia-Barcelona_Catalonia.html","https://www.tripadvisor.in/Attraction_Review-g294318-d668949-Reviews-Santuario_Historico_de_Machu_Picchu-Machu_Picchu_Sacred_Valley_Cusco_Region.html","https://www.tripadvisor.in/Attraction_Review-g150800-d183986-Reviews-Chapultepec_Castle-Mexico_City_Central_Mexico_and_Gulf_Coast.html","https://www.tripadvisor.in/Attraction_Review-g298484-d300392-Reviews-Moscow_Kremlin-Moscow_Central_Russia.html","https://www.tripadvisor.in/Attraction_Review-g187441-d191078-Reviews-The_Alhambra-Granada_Province_of_Granada_Andalucia.html"]
temp = []
temp1=[]
attraction_type = []
attraction_desc = []
attr_id=[]
attraction_rating=[]
attraction_rev=[]
cnt=0
cnt1=0


for k in attraction_key_lst:
    driver.get(k)
    time.sleep(10)
    soup1 = BeautifulSoup(driver.page_source, 'html.parser')  
      
    for name in soup1.find_all('div',attrs = {"class":"dYtkw"}):
        var1=name.text.strip()
        if name in [None]:
            attraction_desc.append("Not Available")
            #print(j)
            substring=k[45:60]
            attr_id.append(substring)
            
        else:
            attraction_desc.append(var1)
            #print(j)
            substring=k[45:60]
            attr_id.append(substring)
            
    for tp in soup1.find_all('div',attrs = {"class":"dlzPP"}):
        var2=tp.text.strip()
        if tp in [None]:
            temp.append("Not Available")
        else:
            temp.append(var2)
            
    for ar in soup1.find_all('div',attrs = {"class":"WlYyy cPsXC fksET cMKSg"}):
        var3=ar.text.strip()
        if ar in [None]:
            attraction_rating.append("Not Available")   
        else:
            attraction_rating.append(var3)
            
    for arev in soup1.find_all('span',attrs = {"class":"WlYyy diXIH bGusc dDKKM"}):
        var4=arev.text.strip()
        if arev in [None]:
            temp1.append("Not Available")
            
        else:
            temp1.append(var4)
            
attraction_type = [temp[cnt] for cnt in range(1,len(temp)) if cnt % 4 == 2]
attraction_rev = [temp1[cnt1] for cnt1 in range(1,len(temp1)) if cnt1 % 2 == 1]
               
driver.quit()

In [25]:
import pandas as pd
df4 = {'Destination_Id':attr_id,'Destination_Description':attraction_desc,'Destination_Type':attraction_type,'Destination_rating':attraction_rating,'Destination_Number_Of_Reviews':attraction_rev}
attraction_df = pd.DataFrame.from_dict(df4)

In [26]:
attraction_df

Unnamed: 0,Destination_Id,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews
0,g186338-d187547,AboutDiscover London’s castle – a secure fortr...,Historic Sites • Points of Interest & Landmarks,4.5,"65,841 reviews"
1,g187514-d190146,"AboutLuxurious, over-the-top rococo palace wit...",Historic Sites • Architectural Buildings,4.5,"34,644 reviews"
2,g186525-d187653,AboutEdinburgh Castle is a world famous icon o...,Castles,4.5,"51,473 reviews"
3,g293974-d294547,AboutThis enormous palace was the Imperial res...,History Museums,4.5,"27,302 reviews"
4,g294212-d319086,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews"
5,g298520-d321110,AboutThe historic fortress dating back to the ...,Speciality Museums • Historic Sites • Architec...,5.0,"3,807 reviews"
6,g189400-d198711,AboutThe majestic ruins of an ancient Greek bu...,Historic Sites • Ancient Ruins • Architectural...,4.5,"16,878 reviews"
7,g187147-d188151,"AboutCompleted in 1889, this colossal landmark...",Points of Interest & Landmarks • Observation D...,4.5,"1,40,408 reviews"
8,g187371-d192291,"AboutThis giant Gothic cathedral, which until ...",Points of Interest & Landmarks • Architectural...,4.5,"21,850 reviews"
9,g187497-d190166,AboutThe Basilica of the Sagrada Familia is a ...,Points of Interest & Landmarks • Architectural...,4.5,"1,63,992 reviews"


#### Scrape the following Hotels data from TripAdvisor website for hotels in cities for the selected destinations:Hotel Name,Hotel Rating,Hotel Number of Reviews

In [27]:
driver = webdriver.Chrome(executable_path='C:/Users/sanyo/chromedriver.exe')
seed_url = "https://www.tripadvisor.in"
driver.get(seed_url)
time.sleep(10)
soup2 = BeautifulSoup(driver.page_source, 'html.parser')

city_key_lst=['-g186338-London_England','-g187514-Madrid','-g186525-Edinburgh_Scotland','-g293974-Istanbul','-g294212-Beijing','-g298520-Kazan_Republic_of_Tatarstan_Volga_District','-g189400-Athens_Attica','-g187147-Paris_Ile_de_France','-g187371-Cologne_North_Rhine_Westphalia','-g187497-Barcelona_Catalonia','-g294318-Machu_Picchu_Sacred_Valley_Cusco_Region','-g187441-Granada_Province_of_Granada_Andalucia','-g298484-Moscow_Central_Russia','-g150800-Mexico_City_Central_Mexico_and_Gulf_Coast']

Hotel = []
Hotel_url=[]
for name in soup2.find_all('a',{"href":"/Hotels"}):
    Hotel.append(name.text.strip())

for i in city_key_lst:
    Hotel_path=seed_url+'/'+Hotel[0]+i+'-Hotels.html'
    Hotel_url.append(Hotel_path)

j_lst=[]
hotel = []
loc_id = []
ratings = []
reviews = []

for j in Hotel_url:
    driver.get(j)
    time.sleep(10)
    soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        
    
    for name in soup2.find_all('div',attrs = {"class":"listing_title"}):
        var5=name.text.strip()
        if name in [None]:
            hotel.append("Not Available")
            #print(j)
            substring=j[34:41]
            loc_id.append(substring)
            
        else:
            hotel.append(var5)
            #print(j)
            substring=j[34:41]
            loc_id.append(substring)
               
    for rating in soup2.find_all('a',attrs = {'class':'ui_bubble_rating'}):
        var6=rating['alt']
        if rating in [None]:
            ratings.append("Not Available")
        else:
            ratings.append(var6)       

    
    for review in soup2.find_all('a',{'class':'review_count'}):
        var7=review.text.strip()
        if review in [None]:
            reviews.append("Not Available")
        else:
            reviews.append(var7)
        
    
driver.quit()

In [28]:
import pandas as pd
df5 = {'City_Id':loc_id,'Hotel':hotel,'Hotel_Ratings':ratings,'Hotel_No_of_Reviews':reviews}
hotel_df = pd.DataFrame.from_dict(df5)

In [29]:
hotel_df

Unnamed: 0,City_Id,Hotel,Hotel_Ratings,Hotel_No_of_Reviews
0,g186338,Sponsored Bermonds Locke,4.5 of 5 bubbles,98 reviews
1,g186338,The Resident Covent Garden,5 of 5 bubbles,783 reviews
2,g186338,The Tower Hotel,4 of 5 bubbles,"11,087 reviews"
3,g186338,Leonardo Royal London Tower Bridge,4.5 of 5 bubbles,"1,545 reviews"
4,g186338,Montcalm Royal London House - City of London,5 of 5 bubbles,"2,574 reviews"
...,...,...,...,...
440,g150800,Hilton Garden Inn Mexico City Santa Fe,4 of 5 bubbles,132 reviews
441,g150800,City Express Plus Insurgentes Sur,4 of 5 bubbles,"1,102 reviews"
442,g150800,Hotel Ritz Mexico,3.5 of 5 bubbles,273 reviews
443,g150800,City Express Aeropuerto Ciudad de Mexico,3.5 of 5 bubbles,227 reviews


#### Merge the Tourist Destination data and Hotels data

In [30]:
destination_attraction=pd.merge(destination_popularity_df, attraction_df, on=["Destination_Id"])

In [31]:
destination_attraction

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews"
1,Parthenon,Athens,7200000,g189400-d198711,g189400,23°C,8%,6%,6%,14%,AboutThe majestic ruins of an ancient Greek bu...,Historic Sites • Ancient Ruins • Architectural...,4.5,"16,878 reviews"
2,Eiffel Tower,Paris,7000000,g187147-d188151,g187147,14°C,12%,8%,4%,47%,"AboutCompleted in 1889, this colossal landmark...",Points of Interest & Landmarks • Observation D...,4.5,"1,40,408 reviews"
3,Cologne Cathedral,Cologne,6000000,g187371-d192291,g187371,12°C,4%,2%,0%,4%,"AboutThis giant Gothic cathedral, which until ...",Points of Interest & Landmarks • Architectural...,4.5,"21,850 reviews"
4,Sagrada Família,Barcelona,4500000,g187497-d190166,g187497,20°C,2%,4%,19%,2%,AboutThe Basilica of the Sagrada Familia is a ...,Points of Interest & Landmarks • Architectural...,4.5,"1,63,992 reviews"
5,Kazan Kremlin,Kazan,2893300,g298520-d321110,g298520,12°C,0%,2%,4%,0%,AboutThe historic fortress dating back to the ...,Speciality Museums • Historic Sites • Architec...,5.0,"3,807 reviews"
6,Tower of London,London,2858336,g186338-d187547,g186338,13°C,27%,12%,10%,4%,AboutDiscover London’s castle – a secure fortr...,Historic Sites • Points of Interest & Landmarks,4.5,"65,841 reviews"
7,Alhambra,Granada,2760000,g187441-d191078,g187441,20°C,4%,6%,0%,2%,AboutThe second most visited site in Europe: t...,Historic Sites • Architectural Buildings • Cas...,4.5,"42,345 reviews"
8,Topkapı Palace,Istanbul,2720119,g293974-d294547,g293974,19°C,8%,0%,4%,4%,AboutThis enormous palace was the Imperial res...,History Museums,4.5,"27,302 reviews"
9,Chapultepec Castle,Mexico City,2661615,g150800-d183986,g150800,15°C,2%,4%,6%,0%,"AboutThis 18th-century palace, known for its i...",Castles,4.5,"10,628 reviews"


#### Scrape the following Destinations data from TripAdvisor website for the selected destinations:Destination Reviews

In [32]:
driver = webdriver.Chrome(executable_path='C:/Users/sanyo/chromedriver.exe')
seed_url = "https://www.tripadvisor.in"
driver.get(seed_url)
time.sleep(10)
soup3 = BeautifulSoup(driver.page_source, 'html.parser')

j=[]
attraction_key_lst=[]
attraction_key_lst=["https://www.tripadvisor.in/Attraction_Review-g186338-d187547-Reviews-Tower_of_London-London_England.html","https://www.tripadvisor.in/Attraction_Review-g187514-d190146-Reviews-Royal_Palace_of_Madrid-Madrid.html","https://www.tripadvisor.in/Attraction_Review-g186525-d187653-Reviews-Edinburgh_Castle-Edinburgh_Scotland.html","https://www.tripadvisor.in/Attraction_Review-g293974-d294547-Reviews-Topkapi_Palace-Istanbul.html","https://www.tripadvisor.in/Attraction_Review-g294212-d319086-Reviews-Forbidden_City_The_Palace_Museum-Beijing.html","https://www.tripadvisor.in/Attraction_Review-g298520-d321110-Reviews-Kazan_Kremlin-Kazan_Republic_of_Tatarstan_Volga_District.html","https://www.tripadvisor.in/Attraction_Review-g189400-d198711-Reviews-Parthenon-Athens_Attica.html","https://www.tripadvisor.in/Attraction_Review-g187147-d188151-Reviews-Eiffel_Tower-Paris_Ile_de_France.html","https://www.tripadvisor.in/Attraction_Review-g187371-d192291-Reviews-Cologne_Cathedral-Cologne_North_Rhine_Westphalia.html","https://www.tripadvisor.in/Attraction_Review-g187497-d190166-Reviews-Basilica_of_the_Sagrada_Familia-Barcelona_Catalonia.html","https://www.tripadvisor.in/Attraction_Review-g294318-d668949-Reviews-Santuario_Historico_de_Machu_Picchu-Machu_Picchu_Sacred_Valley_Cusco_Region.html","https://www.tripadvisor.in/Attraction_Review-g150800-d183986-Reviews-Chapultepec_Castle-Mexico_City_Central_Mexico_and_Gulf_Coast.html","https://www.tripadvisor.in/Attraction_Review-g298484-d300392-Reviews-Moscow_Kremlin-Moscow_Central_Russia.html","https://www.tripadvisor.in/Attraction_Review-g187441-d191078-Reviews-The_Alhambra-Granada_Province_of_Granada_Andalucia.html"]
attraction_review=[]
attr_id=[]


for j in attraction_key_lst:
    driver.get(j)
    time.sleep(10)
    soup3 = BeautifulSoup(driver.page_source, 'html.parser')  
    
            
    for attr_rev in soup3.find_all('div',attrs = {"class":"duhwe _T bOlcm"}):
        var8=attr_rev.text.strip()
        if attr_rev in [None]:
            attraction_review.append("Not Available")
            substring=j[45:60]
            attr_id.append(substring)
            
        else:
            attraction_review.append(var8)
            substring=j[45:60]
            attr_id.append(substring)
                
driver.quit()

In [33]:
import pandas as pd
df6 = {'Destination_Id':attr_id,'Destination_Top_Reviews':attraction_review}
destination_review = pd.DataFrame.from_dict(df6)

In [34]:
destination_review

Unnamed: 0,Destination_Id,Destination_Top_Reviews
0,g186338-d187547,Wonderful attraction especially seeing the Cro...
1,g186338-d187547,Beefeater tour is a must if you visit. laurenc...
2,g186338-d187547,A brilliant day out the beefeater tour is well...
3,g186338-d187547,As mentioned in a previous review. We took the...
4,g186338-d187547,Great visit here with teen and a 10 year old. ...
...,...,...
133,g187441-d191078,The Alhambra in October is an enchanting seque...
134,g187441-d191078,"I looked at pictures online before the trip, b..."
135,g187441-d191078,Amazing !! We opted for a guide her name is Ma...
136,g187441-d191078,"Stunning, amazing views and so much history. A..."


#### Merge the additional Destinations data

In [35]:
destination_attraction_df=pd.merge(destination_attraction, destination_review, on=["Destination_Id"])

In [36]:
destination_attraction_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews,Destination_Top_Reviews
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread..."
1,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews",Every time I'm in Beijing I try to visit the F...
2,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews",The Forbidden City is an amazing place that yo...
3,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews",It is a quintessential display of Chinese cult...
4,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews",The area is astonishing but we met some scamme...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews","Amazing trip, amazing place in Machu Picchu, C..."
134,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",It is a great sight well worth the visit. The ...
135,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",This is the reason we came to Peru and it did ...
136,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",What a great day in Machu Picchu!!!A magic pla...


#### Again, merge the Tourist Destination data and Hotels data after addition of reviews to Tourist Destination data

In [37]:
destination_hotel_attraction_df=pd.merge(destination_attraction_df, hotel_df, on=["City_Id"])

In [38]:
destination_hotel_attraction_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews,Destination_Top_Reviews,Hotel,Hotel_Ratings,Hotel_No_of_Reviews
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Sponsored Artyzen Habitat Dongzhimen Beijing,5 of 5 bubbles,95 reviews
1,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...","China World Summit Wing, Beijing",5 of 5 bubbles,"3,214 reviews"
2,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Shangri-La Beijing,5 of 5 bubbles,"8,489 reviews"
3,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",The Bulgari Hotel Beijing,5 of 5 bubbles,255 reviews
4,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Hotel Jen Beijing by Shangri-La,5 of 5 bubbles,"2,333 reviews"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4381,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,Machupicchu Packer,3.5 of 5 bubbles,55 reviews
4382,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,Quilla Ecologico Inn,4.5 of 5 bubbles,10 reviews
4383,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,Hotel Ferre Machu Picchu,4 of 5 bubbles,408 reviews
4384,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,Gran Paititi Hostal,2 of 5 bubbles,18 reviews


#### Scrape the following Restaurants data from TripAdvisor website for restaurants in cities for the selected destinations:Restaurant Name

In [39]:
driver = webdriver.Chrome(executable_path='C:/Users/sanyo/chromedriver.exe')
seed_url = "https://www.tripadvisor.in"
driver.get(seed_url)
time.sleep(10)
soup4 = BeautifulSoup(driver.page_source, 'html5lib')

city_key_lst=['-g186338-London_England','-g187514-Madrid','-g186525-Edinburgh_Scotland','-g293974-Istanbul','-g294212-Beijing','-g298520-Kazan_Republic_of_Tatarstan_Volga_District','-g189400-Athens_Attica','-g187147-Paris_Ile_de_France','-g187371-Cologne_North_Rhine_Westphalia','-g187497-Barcelona_Catalonia','-g294318-Machu_Picchu_Sacred_Valley_Cusco_Region','-g187441-Granada_Province_of_Granada_Andalucia','-g298484-Moscow_Central_Russia','-g150800-Mexico_City_Central_Mexico_and_Gulf_Coast']

Restaurant = []
Restaurant_url=[]
for name in soup4.find_all('a',{"href":"/Restaurants"}):
    Restaurant.append(name.text.strip())

for i in city_key_lst:
    Restaurant_path=seed_url+'/'+Restaurant[0]+i+'.html'
    Restaurant_url.append(Restaurant_path)

restaurant = []
loc_id = []
res_rev = []

for j in Restaurant_url:
    driver.get(j)
    time.sleep(10)
    soup4 = BeautifulSoup(driver.page_source, 'html.parser')
        
    
    for name in soup4.find_all('div',attrs = {"class":"OhCyu"}):
        var9=name.text.strip()
        if name in [None]:
            restaurant.append("Not Available")
            #print(j)
            substring=j[39:46]
            loc_id.append(substring)
            
        else:
            restaurant.append(var9)
            #print(j)
            substring=j[39:46]
            loc_id.append(substring)  
                        
driver.quit()

In [40]:
import pandas as pd
df8 = {'City_Id':loc_id,'Top_Restaurants':restaurant}
restaurant_df = pd.DataFrame.from_dict(df8)

In [41]:
restaurant_df

Unnamed: 0,City_Id,Top_Restaurants
0,g186338,Steak & Co. Garrick Street Covent Garden
1,g186338,1. Bonoo Indian Tapas
2,g186338,2. Hibox
3,g186338,3. Indian Room
4,g186338,4. Iznik
...,...,...
472,g150800,27. Dalis pizzeria italiana
473,g150800,28. Sonora Grill Amores
474,g150800,29. Salute Food & Bar
475,g150800,30. Sonora Grill Miyana


#### Merge Tourist Destination and Restaurants data

In [42]:
destination_restaurant_attraction_df=pd.merge(destination_attraction_df, restaurant_df, on=["City_Id"])

In [43]:
destination_restaurant_attraction_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews,Destination_Top_Reviews,Top_Restaurants
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Vege Wonder
1,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",1. King's Joy
2,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",2. Cafe Cha - Shangri-La Beijing
3,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",3. F Bistronome - FB
4,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",4. The View 3912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4689,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,26. El Mapi Restaurant
4690,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,27. Ayasqa
4691,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,28. Cafelu
4692,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,29. Dining Room & Bar


#### Merge the Tourist destinations, Hotels and Restaurants data. The output should be a data frame with 19 attributes

In [44]:
Destination_hotel_restaurant_df = pd.concat([destination_hotel_attraction_df, destination_restaurant_attraction_df], axis=0)

In [45]:
Destination_hotel_restaurant_df

Unnamed: 0,Destination_Name,City,Visitors_per_year,Destination_Id,City_Id,City_Average_Temperature,Preferance_Family_Holiday_Destination,Preferance_Solo_Trip_Destination,Preferance_Friends_Holiday_Destination,Preferance_Spouse/Partner_Holiday_Destination,Destination_Description,Destination_Type,Destination_rating,Destination_Number_Of_Reviews,Destination_Top_Reviews,Hotel,Hotel_Ratings,Hotel_No_of_Reviews,Top_Restaurants
0,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Sponsored Artyzen Habitat Dongzhimen Beijing,5 of 5 bubbles,95 reviews,
1,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...","China World Summit Wing, Beijing",5 of 5 bubbles,"3,214 reviews",
2,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Shangri-La Beijing,5 of 5 bubbles,"8,489 reviews",
3,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",The Bulgari Hotel Beijing,5 of 5 bubbles,255 reviews,
4,Forbidden City,Beijing,"17,000,000+",g294212-d319086,g294212,9°C,0%,4%,4%,4%,"AboutConsisting of more than 9,000 rooms and s...",Speciality Museums • Architectural Buildings •...,4.5,"14,097 reviews","Consisting of more than 9,000 rooms and spread...",Hotel Jen Beijing by Shangri-La,5 of 5 bubbles,"2,333 reviews",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4689,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,,,,26. El Mapi Restaurant
4690,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,,,,27. Ayasqa
4691,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,,,,28. Cafelu
4692,Machu Picchu,Cusco,1411276,g294318-d668949,g294318,13°C,4%,21%,13%,0%,AboutSuggested durationMore than 3 hoursFeatur...,Historic Sites • Ancient Ruins,5.0,"19,605 reviews",Thank you for visiting Machu Picchu. We hope y...,,,,29. Dining Room & Bar


#### Export the scraped data to excel

In [46]:
Destination_hotel_restaurant_df.to_excel(r'C:\Users\sanyo\OneDrive - Indian School of Business\Term1\DC\Office_Hour\get_phones_data\get_phones_data\Tourist_Destinations_Of_The_World_v4.xlsx',  index = False)