# This is the Data Collection Notebook for Travel Advisor

In [97]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# chrome_options = Options()
# chrome_options.add_argument('--headless')  
import pandas as pd

### The function below scrapes TripAdvisor attraction names for one city; it is called once for each of the top 12 cities

In [2]:
def _strip_rank_prefix(raw_name):
    """Drop a leading ``"<digits>."`` ranking prefix from a scraped name, if present."""
    prefix, dot, rest = raw_name.partition('.')
    # Only treat the text before the first dot as a rank when it is purely
    # numeric; names without a rank (or without any dot) are returned untouched
    # instead of being truncated or raising IndexError.
    if dot and prefix.strip().isdigit():
        return rest
    return raw_name


def scrape_tripadvisor_attractions(city, country, url_format, max_offset=3000, page_size=30):
    """Scrape attraction names from paginated TripAdvisor listing pages.

    For every pagination offset a headless Chrome session is started, the page
    is loaded, and its page source is parsed with BeautifulSoup. The text of
    each ``div`` whose class attribute is ``"XfVdV o AIbhI"`` is collected as
    one attraction name, with any leading ``"<number>."`` rank removed.

    Parameters
    ----------
    city, country : str
        Combined into the constant ``City`` column as ``"{city}, {country}"``.
    url_format : str
        URL template containing a ``{}`` placeholder that is filled with the
        pagination offset via ``str.format``.
    max_offset : int, default 3000
        Largest offset requested (inclusive).
    page_size : int, default 30
        Step between consecutive offsets.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Attraction`` and ``City``, one row per scraped name, in
        the order the names were encountered. Empty if nothing matched.
    """
    chrome_options = Options()

    # Running Chrome in headless mode (without opening a browser window)
    chrome_options.add_argument("--headless")

    names = []
    for offset in range(0, max_offset + 1, page_size):
        # A fresh browser session is used for each page, as before.
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url_format.format(offset))
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when loading the page raises,
            # so failed requests do not leave Chrome processes behind.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')

        # find_all only yields matching tags (never None), so no per-element
        # existence check is needed before get_text().
        for element in soup.find_all('div', {"class": "XfVdV o AIbhI"}):
            names.append(_strip_rank_prefix(element.get_text(strip=True)))

    # Converting the collected names into a dataframe
    df = pd.DataFrame(names, columns=['Attraction'])
    df['City'] = f"{city}, {country}"

    return df

### London

In [3]:
# Paginated TripAdvisor listing for London; `{}` is filled with the page offset.
london_url = "https://www.tripadvisor.com/Attractions-g186338-Activities-oa{}-London_England.html"
london_df = scrape_tripadvisor_attractions(
    city="London",
    country="United Kingdom",
    url_format=london_url,
)
print(london_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
london_df.to_csv('/Users/lianaavagyan/Desktop/london_df.csv', index=False)

                          Attraction                    City
0                  Royal Opera House  London, United Kingdom
1                        HMS Belfast  London, United Kingdom
2                 London Underground  London, United Kingdom
3     Tottenham Hotspur Stadium Tour  London, United Kingdom
4                          Frameless  London, United Kingdom
...                              ...                     ...
1615               Rooftop Film Club  London, United Kingdom
1616                          Floris  London, United Kingdom
1617        The Great Exhibition Bar  London, United Kingdom
1618                   Big Bus Tours  London, United Kingdom
1619          Boom Battle Bar The O2  London, United Kingdom

[1620 rows x 2 columns]


### Paris

In [9]:
# Paginated TripAdvisor listing for Paris; `{}` is filled with the page offset.
# NOTE(review): this is an `Attraction_Products` URL rather than the
# `Attractions-...-Activities` pattern used for most other cities, and the
# printed rows look like bookable tours; confirm that is the intended source.
paris_url = "https://www.tripadvisor.com/Attraction_Products-g187147-o{}-Paris_Ile_de_France.html"
paris_df = scrape_tripadvisor_attractions(
    city="Paris",
    country="France",
    url_format=paris_url,
)
print(paris_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
paris_df.to_csv('/Users/lianaavagyan/Desktop/paris_df.csv', index=False)

                                             Attraction           City
0                              Paris Ultimate Food Tour  Paris, France
1     Musée d’Orsay Orsay Museum Tour - Semi-Private...  Paris, France
2     Somme Battlefields from Paris with Australian ...  Paris, France
3     Louvre Museum Guided Tour Options with Entry T...  Paris, France
4                     Big Bus Paris Open Top Night Tour  Paris, France
...                                                 ...            ...
1615     Brussels to Paris Private Full-Day Return Trip  Paris, France
1616  Le Moulin Jaune & Castle of Rentilly - Private...  Paris, France
1617                   Wine Your Way Through the Marais  Paris, France
1618  Departure Transfer from Paris to Paris Airport...  Paris, France
1619  8 hours Paris City Tour with Seine River Cruis...  Paris, France

[1620 rows x 2 columns]


### Crete

In [24]:
# Paginated TripAdvisor listing for Crete; `{}` is filled with the page offset.
# NOTE(review): this is an `Attraction_Products` URL rather than the
# `Attractions-...-Activities` pattern used for most other cities, and the
# printed rows look like bookable tours; confirm that is the intended source.
crete_url = "https://www.tripadvisor.com/Attraction_Products-g189413-o{}-Crete.html"
crete_df = scrape_tripadvisor_attractions(
    city="Crete",
    country="Greece",
    url_format=crete_url,
)
print(crete_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
crete_df.to_csv('/Users/lianaavagyan/Desktop/crete_df.csv', index=False)

                                            Attraction           City
0    Full-Day Private Adventure: Knossos, Zeus Cave...  Crete, Greece
1    Sailing Cruise to Dia Island with Meal and Wat...  Crete, Greece
2                     Samaria Gorge Hiking from Chania  Crete, Greece
3    Day Tour to Santorini Island from Heraklion Crete  Crete, Greece
4    Road-trip to Lasithi: Zeus Cave, Krasi, Vidian...  Crete, Greece
..                                                 ...            ...
945                                   South Crete Tour  Crete, Greece
946  WEST CRETE LONG TRIP TOUR up to 4 to 15 customers  Crete, Greece
947                 NORTH CRETE TOUR up to 4 customers  Crete, Greece
948  East Crete Around the Mountains Tour 4 to 15 c...  Crete, Greece
949  Transfer from Chania airport to Paleochora up ...  Crete, Greece

[950 rows x 2 columns]


### Bali

In [29]:
# Paginated TripAdvisor listing for Bali; `{}` is filled with the page offset.
bali_url = "https://www.tripadvisor.com/Attractions-g294226-Activities-oa{}-Bali.html"
bali_df = scrape_tripadvisor_attractions(
    city="Bali",
    country="Indonesia",
    url_format=bali_url,
)
print(bali_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
bali_df.to_csv('/Users/lianaavagyan/Desktop/bali_df.csv', index=False)

                          Attraction             City
0                      Waterbom Bali  Bali, Indonesia
1     Sacred Monkey Forest Sanctuary  Bali, Indonesia
2            Tegalalang Rice Terrace  Bali, Indonesia
3                           Bali Zoo  Bali, Indonesia
4                    Museum PASIFIKA  Bali, Indonesia
...                              ...              ...
1615          Bali Family Safe Tours  Bali, Indonesia
1616              Bali Volcano Tours  Bali, Indonesia
1617                 Karmen & Marius  Bali, Indonesia
1618       Morning Light Yoga Studio  Bali, Indonesia
1619                  Bali Different  Bali, Indonesia

[1620 rows x 2 columns]


### Rome

In [41]:
# Paginated TripAdvisor listing for Rome; `{}` is filled with the page offset.
rome_url = "https://www.tripadvisor.com/Attractions-g187791-Activities-oa{}-Rome_Lazio.html"
rome_df = scrape_tripadvisor_attractions(
    city="Rome",
    country="Italy",
    url_format=rome_url,
)
print(rome_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
rome_df.to_csv('/Users/lianaavagyan/Desktop/rome_df.csv', index=False)

                  Attraction         City
0                  Colosseum  Rome, Italy
1                   Pantheon  Rome, Italy
2             Trevi Fountain  Rome, Italy
3              Piazza Navona  Rome, Italy
4          Galleria Borghese  Rome, Italy
...                      ...          ...
1555         Al Punto Divino  Rome, Italy
1556    Italy Tours for Kids  Rome, Italy
1557            L'Alchimista  Rome, Italy
1558     Guia Turistica Roma  Rome, Italy
1559  Koala 2.0 Industry Aps  Rome, Italy

[1560 rows x 2 columns]


### Phuket

In [45]:
# Paginated TripAdvisor listing for Phuket; `{}` is filled with the page offset.
phuket_url = "https://www.tripadvisor.com/Attractions-g293920-Activities-oa{}-Phuket.html"
phuket_df = scrape_tripadvisor_attractions(
    city="Phuket",
    country="Thailand",
    url_format=phuket_url,
)
print(phuket_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
phuket_df.to_csv('/Users/lianaavagyan/Desktop/phuket_df.csv', index=False)

                                Attraction              City
0                        Big Buddha Phuket  Phuket, Thailand
1                              Bangla Road  Phuket, Thailand
2            Green Elephant Sanctuary Park  Phuket, Thailand
3                             Banana Beach  Phuket, Thailand
4                               Kata Beach  Phuket, Thailand
...                                    ...               ...
1506                          Patong Beach  Phuket, Thailand
1507                    See Adventure Club  Phuket, Thailand
1508  Sanae Thai Outcall Massage In Phuket  Phuket, Thailand
1509      Sanaethai Phuket Outcall Massage  Phuket, Thailand
1510               Phuket Travel City Tour  Phuket, Thailand

[1511 rows x 2 columns]


### Sicily

In [46]:
# Paginated TripAdvisor listing for Sicily; `{}` is filled with the page offset.
sicily_url = "https://www.tripadvisor.com/Attractions-g187886-Activities-oa{}-Sicily.html"
sicily_df = scrape_tripadvisor_attractions(
    city="Sicily",
    country="Italy",
    url_format=sicily_url,
)
print(sicily_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
sicily_df.to_csv('/Users/lianaavagyan/Desktop/sicily_df.csv', index=False)

                                 Attraction           City
0                          Valle dei Templi  Sicily, Italy
1                                   Ortigia  Sicily, Italy
2                                Mount Etna  Sicily, Italy
3               Ancient Theatre of Taormina  Sicily, Italy
4                      Spiaggia dei Conigli  Sicily, Italy
...                                     ...            ...
1555                     Chiesa del Carmine  Sicily, Italy
1556  Chiesa Santa Maria della Misericordia  Sicily, Italy
1557                          Forte Gonzaga  Sicily, Italy
1558          Chiesa di San Lorenzo Martire  Sicily, Italy
1559          Monumento al Cardinale Dusmet  Sicily, Italy

[1560 rows x 2 columns]


### Majorca

In [60]:
# Paginated TripAdvisor listing for Majorca; `{}` is filled with the page offset.
# NOTE(review): unlike the other `Attractions` templates in this notebook, this
# one has a literal `0` between `oa` and `{}`, so offsets render as `oa00`,
# `oa030`, ...; confirm this is intentional and not a typo.
majorca_url = "https://www.tripadvisor.com/Attractions-g187462-Activities-oa0{}-Majorca_Balearic_Islands.html"
majorca_df = scrape_tripadvisor_attractions(
    city="Majorca",
    country="Balearic Islands",
    url_format=majorca_url,
)
print(majorca_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
majorca_df.to_csv('/Users/lianaavagyan/Desktop/majorca_df.csv', index=False)

                           Attraction                       City
0                     Playa Cala Gran  Majorca, Balearic Islands
1                Playa de Ses Salines  Majorca, Balearic Islands
2                Formentor Lighthouse  Majorca, Balearic Islands
3        Mirador Es Colomer Formentor  Majorca, Balearic Islands
4                Jungle Parc Mallorca  Majorca, Balearic Islands
...                               ...                        ...
1132                    Sant Elm Yoga  Majorca, Balearic Islands
1133  Tablao Flamenco Alma (Mallorca)  Majorca, Balearic Islands
1134       Destination Services Spain  Majorca, Balearic Islands
1135          Vitalyacht's & Egeajets  Majorca, Balearic Islands
1136                    Lance Eriksen  Majorca, Balearic Islands

[1137 rows x 2 columns]


### Barcelona

In [62]:
# Paginated TripAdvisor listing for Barcelona; `{}` is filled with the page offset.
barcelona_url = "https://www.tripadvisor.com/Attractions-g187497-Activities-oa{}-Barcelona_Catalonia.html"
barcelona_df = scrape_tripadvisor_attractions(
    city="Barcelona",
    country="Spain",
    url_format=barcelona_url,
)
print(barcelona_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
barcelona_df.to_csv('/Users/lianaavagyan/Desktop/barcelona_df.csv', index=False)

                          Attraction              City
0     Basílica de la Sagrada Familia  Barcelona, Spain
1                        Casa Batlló  Barcelona, Spain
2       Gothic Quarter (Barri Gotic)  Barcelona, Spain
3            Palace of Catalan Music  Barcelona, Spain
4                         Parc Guell  Barcelona, Spain
...                              ...               ...
1615            Histories de Ciencia  Barcelona, Spain
1616  Centre Civic Cotxeres de Sants  Barcelona, Spain
1617             Taller de Chocolate  Barcelona, Spain
1618                 Casa Muley Afid  Barcelona, Spain
1619              Bcnjoy Party Tours  Barcelona, Spain

[1620 rows x 2 columns]


### Istanbul

In [74]:
# Paginated TripAdvisor listing for Istanbul; `{}` is filled with the page offset.
istanbul_url = "https://www.tripadvisor.com/Attractions-g293974-Activities-oa{}-Istanbul.html"
istanbul_df = scrape_tripadvisor_attractions(
    city="Istanbul",
    country="Turkey",
    url_format=istanbul_url,
)
print(istanbul_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
istanbul_df.to_csv('/Users/lianaavagyan/Desktop/istanbul_df.csv', index=False)

                      Attraction              City
0            Hagia Sophia Mosque  Istanbul, Turkey
1               Basilica Cistern  Istanbul, Turkey
2                    Blue Mosque  Istanbul, Turkey
3             Suleymaniye Mosque  Istanbul, Turkey
4           Sultanahmet District  Istanbul, Turkey
...                          ...               ...
1675    Village Park Country Bar  Istanbul, Turkey
1676                 ByButa Tour  Istanbul, Turkey
1677  Magnaura Tours - Day Tours  Istanbul, Turkey
1678                  Cinemarine  Istanbul, Turkey
1679                Mavi Marmara  Istanbul, Turkey

[1680 rows x 2 columns]


### Dubai

In [82]:
# Paginated TripAdvisor listing for Dubai; `{}` is filled with the page offset.
dubai_url = "https://www.tripadvisor.com/Attractions-g295424-Activities-oa{}-Dubai_Emirate_of_Dubai.html"
dubai_df = scrape_tripadvisor_attractions(
    city="Dubai",
    country="United Arab Emirates",
    url_format=dubai_url,
)
print(dubai_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
dubai_df.to_csv('/Users/lianaavagyan/Desktop/dubai_df.csv', index=False)

                          Attraction                         City
0                 The Dubai Fountain  Dubai, United Arab Emirates
1                       Burj Khalifa  Dubai, United Arab Emirates
2     Atlantis Aquaventure Waterpark  Dubai, United Arab Emirates
3                     The Dubai Mall  Dubai, United Arab Emirates
4                       AURA SKYPOOL  Dubai, United Arab Emirates
...                              ...                          ...
1585              Desert Safari Deal  Dubai, United Arab Emirates
1586                       Chuk Palu  Dubai, United Arab Emirates
1587               Closet Case Dubai  Dubai, United Arab Emirates
1588                   Travelo Dubai  Dubai, United Arab Emirates
1589      Sun & Sky Tourism & Travel  Dubai, United Arab Emirates

[1590 rows x 2 columns]


### Vienna

In [91]:
# Paginated TripAdvisor listing for Vienna; `{}` is filled with the page offset.
vienna_url = "https://www.tripadvisor.com/Attractions-g190454-Activities-oa{}-Vienna.html"
vienna_df = scrape_tripadvisor_attractions(
    city="Vienna",
    country="Austria",
    url_format=vienna_url,
)
print(vienna_df)

# Keep a per-city CSV snapshot (row index omitted).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
vienna_df.to_csv('/Users/lianaavagyan/Desktop/vienna_df.csv', index=False)

                        Attraction             City
0                        Volksoper  Vienna, Austria
1                        Donaupark  Vienna, Austria
2     Spittelberg Christmas Market  Vienna, Austria
3               Ernst Fuchs Museum  Vienna, Austria
4          3D PicArt Museum Vienna  Vienna, Austria
...                            ...              ...
1195                  Wien Gitarre  Vienna, Austria
1196             Casino Kulinarium  Vienna, Austria
1197                      Rotknopf  Vienna, Austria
1198                    BTC Travel  Vienna, Austria
1199                       Xpatbus  Vienna, Austria

[1200 rows x 2 columns]


### Concatenating all the DFs we have

In [93]:
# Stack the twelve per-city frames into one table, in the order listed.
cities_list = [
    london_df, paris_df, crete_df, bali_df, rome_df, phuket_df,
    sicily_df, majorca_df, barcelona_df, istanbul_df, dubai_df, vienna_df,
]

# Every per-city frame has its own 0..n-1 index, so a plain concat would repeat
# the same labels once per city. ignore_index=True gives the combined frame a
# single unique 0..N-1 index, so label-based lookups address exactly one row.
cities_df = pd.concat(cities_list, ignore_index=True)
cities_df.head()

Unnamed: 0,Attraction,City
0,Royal Opera House,"London, United Kingdom"
1,HMS Belfast,"London, United Kingdom"
2,London Underground,"London, United Kingdom"
3,Tottenham Hotspur Stadium Tour,"London, United Kingdom"
4,Frameless,"London, United Kingdom"


In [94]:
# Sanity check: list the distinct City labels present in the combined frame
# (one per scraped destination is expected).
cities_df.loc[:, 'City'].unique()

array(['London, United Kingdom', 'Paris, France', 'Crete, Greece',
       'Bali, Indonesia', 'Rome, Italy', 'Phuket, Thailand',
       'Sicily, Italy', 'Majorca, Balearic Islands', 'Barcelona, Spain',
       'Istanbul, Turkey', 'Dubai, United Arab Emirates',
       'Vienna, Austria'], dtype=object)

In [95]:
# Write the combined frame to disk in CSV format.
# NOTE(review): the target file name has no `.csv` extension and, unlike the
# per-city exports, the row index is written too (no index=False); presumably
# the Modeling notebook reads this exact file, so confirm before changing either.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
cities_output_path = '/Users/lianaavagyan/Desktop/capstone/cities_df'
cities_df.to_csv(cities_output_path)

### Analysis is continued in Modeling Notebook