# Business Project of Alessandro Derchi 
## June 29th 2021 
## Programming with Advanced Computer Languages


## 1. Setup 
In order to run the code please follow the steps:

Please download the following libraries:

In [None]:
import requests
import bs4
import pandas as pd

In [None]:
url = "https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&source=structured_search_input_header&search_type=filter_change&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=july&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&checkin=2021-07-07&checkout=2021-07-10&adults=2"

The function get_page takes the url as input and returns the underlying HTML code as a BeautifulSoup object. The required libraries (requests and bs4) must be imported first.

In [None]:
def get_page(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: If the request cannot be completed
            (including when the timeout expires).
    """
    response = requests.get(url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'html.parser')

soup = get_page(url)
soup

A brief look at the given webpage shows that the information on the different listings is shown in a list form.
For every listing a preview image is shown together with some standard information (title, rating, price, etc.)

The get_listings function should take a BeautifulSoup object containing the code for a whole webpage as input and return a list of the individual pieces of code for each listing.

In [None]:
# Tag name and CSS class used to locate a single listing on the results page.
# NOTE(review): the class name is presumably site-generated and may change.
listing_tag = "div"
listing_class = "_8ssblpx"


def get_listings(soup):
    """Return all elements of *soup* matching the listing tag and class."""
    listing_attrs = {"class": listing_class}
    matches = soup.find_all(listing_tag, listing_attrs)
    return matches

get_listings(soup)[0]

## 2. Retrieving the data

Now that the code for the separate listings is retrieved, we want to retrieve separate information from each listing.

For each part of information that we can retrieve from the preview image on Airbnb, we will use functions for each part of information. To check if the code also works we will get as output the relevant information for each listing.

1. Title

In [None]:
# Tag name and CSS class of the element holding a listing's title.
title_class = "_5kaapu"
title_tag = "div"


def get_listing_title(listing):
    """Return the title text of *listing*, or None when it has no title element.

    None (rather than False) is used for the missing case so that this
    function agrees with the other ``get_listing_*`` extractors.
    """
    try:
        return listing.find(title_tag, {"class": title_class}).text
    except AttributeError:
        # ``find`` returned None: the title element is absent.
        return None

get_listing_title(get_listings(soup)[0])

2. Type of property


In [None]:
# Tag name and CSS class of the "<type> in <location>" line of a listing.
property_class = "_1tanv1h"
property_tag = "div"


def get_listing_property(listing):
    """Return the part of the listing's description line before " in ".

    If the text contains no " in ", the whole text is returned.
    Returns None when the element is missing.
    """
    try:
        mystring = listing.find(property_tag, {"class": property_class}).text
    except AttributeError:
        # ``find`` returned None: the element is absent.
        return None
    return mystring.partition(" in ")[0]
get_listing_property(get_listings(soup)[0])

3. Location


In [None]:
# Tag name and CSS class of the "<type> in <location>" line of a listing.
location_class = "_1tanv1h"
location_tag = "div"


def get_listing_location(listing):
    """Return the part of the listing's description line after the first " in ".

    If the text contains no " in ", an empty string is returned.
    Returns None when the element is missing.
    """
    try:
        mystring = listing.find(location_tag, {"class": location_class}).text
    except AttributeError:
        # ``find`` returned None: the element is absent.
        return None
    return mystring.partition(" in ")[2]

get_listing_location(get_listings(soup)[0])

4. Info

In [None]:
# Tag name and CSS class shared by the info and amenities lines of a listing.
info_class = "_3c0zz1"
info_tag = "div"


def get_listing_info(listing):
    """Return the text of the first info element of *listing*, or None if absent."""
    try:
        return listing.find_all(info_tag, {"class": info_class})[0].text
    except (AttributeError, IndexError):
        # No matching element was found.
        return None

get_listing_info(get_listings(soup)[0])

5. Amenities

In [None]:
# Tag name and CSS class shared by the info and amenities lines of a listing.
ammenities_class = "_3c0zz1"
ammenities_tag = "div"


def get_listing_ammenities(listing):
    """Return the text of the second matching element of *listing*.

    The first match is the info line; the second is treated as the
    amenities line. Returns None when there are fewer than two matches.
    """
    try:
        return listing.find_all(ammenities_tag, {"class": ammenities_class})[1].text
    except (AttributeError, IndexError):
        # Fewer than two matching elements were found.
        return None

get_listing_ammenities(get_listings(soup)[0])

6. Rating 

In [None]:
# Tag name and CSS class of the element holding a listing's rating.
rating_class = "_10fy1f8"
rating_tag = "span"


def get_listing_rating(listing):
    """Return the rating of *listing* as a float.

    Returns None when the rating element is missing or its text is not a
    number.
    """
    try:
        return float(listing.find(rating_tag, {"class": rating_class}).text)
    except (AttributeError, ValueError):
        # AttributeError: element absent; ValueError: text is not numeric.
        return None

get_listing_rating(get_listings(soup)[0])

7. Number of reviews

In [None]:
# Tag name and CSS class of the element holding a listing's review count.
reviews_class = "_a7a5sx"
reviews_tag = "span"


def get_listing_reviews(listing):
    """Return the number of reviews of *listing* as an int.

    Returns None when the element is missing or no number can be parsed.
    """
    try:
        raw = listing.find(reviews_tag, {"class": reviews_class}).text
    except AttributeError:
        # ``find`` returned None: the element is absent.
        return None
    # The slice drops the first two and the last character
    # (assumes text shaped like " (123 reviews)" -- TODO confirm).
    # ``strip`` removes any of the characters in " reviews" from both ends,
    # which leaves the digits; commas are thousands separators ("1,234").
    digits = raw[2:-1].strip(" reviews").replace(",", "")
    try:
        return int(digits)
    except ValueError:
        return None

get_listing_reviews(get_listings(soup)[0])

8. Price per night

In [None]:
# Tag name and CSS class of the element holding a listing's nightly price.
price_per_night_class = "_1gi6jw3f"
price_per_night_tag = "div"


def get_listing_price_per_night(listing):
    """Return the price per night of *listing* as an int.

    Only the text after the last "$" is used, so when several prices are
    present the last one wins. Returns None when the element is missing or
    no number can be parsed.
    """
    try:
        raw = listing.find(price_per_night_tag, {"class": price_per_night_class}).text
    except AttributeError:
        # ``find`` returned None: the element is absent.
        return None
    # ``strip`` removes any of the characters in "/ night" from both ends;
    # commas are thousands separators ("$1,250 / night").
    digits = raw.split("$")[-1].strip("/ night").replace(",", "")
    try:
        return int(digits)
    except ValueError:
        return None

get_listing_price_per_night(get_listings(soup)[0])

Next, we need a function to retrieve the url of the next results page. The function find_next_page takes a soup object containing the code for an individual page as input and returns the complete url for the next page. If there are no more pages left, it returns None. We need the base_url to set this up.

In [None]:
# Site root that relative "next page" links are appended to, plus the tag
# name and CSS class of the "next page" link.
base_url = "https://airbnb.com"
next_page_class = "_za9j7e"
next_page_tag = "a"


def find_next_page(page):
    """Return the full url of the page following *page*, or None on the last page.

    Args:
        page: Parsed results page to search for the "next page" link.

    Returns:
        ``base_url`` joined with the link's ``href``, or None when the link
        is missing or has no ``href``.
    """
    # Search the page that was passed in, not a global variable.
    link = page.find(next_page_tag, {"class": next_page_class})
    try:
        return base_url + link["href"]
    except (TypeError, KeyError):
        # TypeError: no link found (None); KeyError: link without href.
        return None

find_next_page(soup)

Next, we need to retrieve the data above for all listings in all webpages. We use a for loop to retrieve the information and store the information in lists.

In [None]:
all_listings = []
url = "https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&source=structured_search_input_header&search_type=filter_change&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=july&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&checkin=2021-07-07&checkout=2021-07-10&adults=2"

# Walk the result pages until find_next_page reports that none is left.
# Remembering visited urls guards against a page that links back to itself.
visited_urls = set()
while url and url not in visited_urls:
    visited_urls.add(url)
    try:
        soup = get_page(url)
    except requests.RequestException:
        # Best effort: keep whatever was collected before the failure.
        break
    all_listings.extend(get_listings(soup))
    url = find_next_page(soup)

In [None]:
# One list per scraped attribute; entry i of every list belongs to
# all_listings[i].
title, info, location, type_of_property = [], [], [], []
ammenities, rating, reviews, price_per_night = [], [], [], []

# Each target list paired with the function that extracts its value.
column_extractors = (
    (title, get_listing_title),
    (location, get_listing_location),
    (type_of_property, get_listing_property),
    (info, get_listing_info),
    (ammenities, get_listing_ammenities),
    (rating, get_listing_rating),
    (reviews, get_listing_reviews),
    (price_per_night, get_listing_price_per_night),
)

for listing in all_listings:
    for column, extract in column_extractors:
        column.append(extract(listing))

## 3. Saving the data

Next, in order to view all information we retrieved, we need to store it in a DataFrame.

We store the data in the DataFrame object and call it airbnb. The names of the different columns are equal to those of the lists we just created: title, location, type_of_property, info, ammenities, rating, reviews and price_per_night. However, for further analysis we do not need the title of the listing as it does not give us added value.

In [None]:
# Column names mirror the names of the lists they are built from.
data = {
    'title': title,
    'location': location,
    'type_of_property': type_of_property,
    'ammenities': ammenities,
    'info': info,
    'rating': rating,
    'reviews': reviews,
    'price_per_night': price_per_night,
}

airbnb = pd.DataFrame(data=data)
airbnb

Sanity check: 

Here you can check the basic information of the dataframe and see how many entries there are for each column.

In [None]:
airbnb.info()

In order to set a benchmark for which properties we favour, we need to look at the averages of the integer and floating-point columns of our dataframe. We store the average rating and the average price per night in variables.

In [None]:
# Average of every numeric column; numeric_only=True skips the text columns,
# which newer pandas versions otherwise refuse to average.
airbnb.mean(axis=0, numeric_only=True)

In [None]:
# Select the column by name instead of by its position in the result of
# DataFrame.mean, which depends on column order and on the text columns.
avg_rating = airbnb['rating'].mean()

In [None]:
# Select the column by name instead of by its position in the result of
# DataFrame.mean, which depends on column order and on the text columns.
avg_price_per_night = airbnb['price_per_night'].mean()

Here, we can count how many listings of each property type exist in our data and should be taken into consideration for the evaluation.

In [None]:
# Number of listings for each type of property.
airbnb.groupby(by=["type_of_property"])["type_of_property"].count()


Since we want the highest possible return on investment we hope to receive a high price per night. Therefore, we will check which location brings most money by calculating the average of all integer and floating columns and set a descending order for price_per_night. 

In [None]:
# Average the numeric columns per location (numeric_only=True skips the text
# columns, which newer pandas versions otherwise refuse to average), then
# put the most expensive locations first.
airbnb_groupby = airbnb.groupby(by=["location"]).mean(numeric_only=True)
airbnb_groupby = airbnb_groupby.sort_values(by=['price_per_night'], ascending=False)
airbnb_groupby.head()

In [None]:
# Show only the locations whose average rating and average price per night
# both exceed the overall averages.
display(
    airbnb_groupby[
        airbnb_groupby['rating'].gt(avg_rating)
        & airbnb_groupby['price_per_night'].gt(avg_price_per_night)
    ]
)

This dataframe is the first result of the best locations to consider.

Now it is important to know in what type of property the investors want to invest. This is done by grouping by the type of property and showing the result in descending order of the column "rating". 

In [None]:
# Average the numeric columns per property type (numeric_only=True skips the
# text columns, which newer pandas versions otherwise refuse to average),
# then put the best rated types first.
airbnb_groupby2 = airbnb.groupby(by=["type_of_property"]).mean(numeric_only=True)
airbnb_groupby2 = airbnb_groupby2.sort_values(by=['rating'], ascending=False)
airbnb_groupby2.head()

Based on the assumption that the investors want to have a successful estate that gets high ratings we will consider only the ones that have a higher than average rating review. We also want to consider ratings with enough reviews (above 10 reviews) to consider the following types of property.

In [None]:
# Show only the property types rated above the overall average that also
# average more than 10 reviews.
display(
    airbnb_groupby2[
        airbnb_groupby2['rating'].gt(avg_rating)
        & airbnb_groupby2['reviews'].gt(10)
    ]
)

This is the second result, showing which types of properties to consider. 

Next we want to see which features lead to higher ratings in order to satisfy the tourists' expectations for their stay. 

# 4. Features of properties to consider

First, we need to gain more detailed information from get_listing_info and get_listing_ammenities with the following code. Please note that due to feature selection we will disregard the feature bed, as it is already described by the term bedrooms.

In [None]:
# Per-listing columns parsed from the info line of each listing.
guests, bedrooms, baths = [], [], []

def get_listing_info_each(all_listings):
    """Split the info line of every listing into its "·"-separated parts.

    Args:
        all_listings: Iterable of listing elements.

    Returns:
        A list with one entry per listing, in order: the list of text parts
        of the first info element, or False when the listing has no such
        element.
    """
    info_each = []
    info_class = "_3c0zz1"
    info_tag = "div"
    for listing in all_listings:
        try:
            info_each.append(listing.find(info_tag, {"class": info_class}).text.split("·"))
        except AttributeError:
            # ``find`` returned None; keep a placeholder so that positions
            # still line up with all_listings.
            info_each.append(False)
    return info_each


for y in get_listing_info_each(all_listings):
    # A falsy entry means the info line could not be read for this listing.
    # Record a gap in every column so the lists stay aligned with
    # all_listings instead of crashing on the placeholder.
    if not y:
        guests.append(None)
        bedrooms.append(None)
        baths.append(None)
        continue

    # for guests: leading number of the first part
    try:
        guests.append(int(y[0].split()[0]))
    except (IndexError, ValueError):
        # Part is empty or does not start with a plain integer.
        guests.append(None)

    # for bedrooms: leading number of the second part, if it is a number
    try:
        number_bedrooms = y[1].split()[0]
    except IndexError:
        # Second part is missing or empty.
        number_bedrooms = ""
    bedrooms.append(int(number_bedrooms) if number_bedrooms.isdigit() else None)

    # for baths: leading number of the fourth part (may be fractional)
    try:
        baths.append(float(y[3].split()[0]))
    except (IndexError, ValueError):
        # Fourth part is missing or does not start with a number.
        baths.append(None)

In [None]:
# One 0/1 flag column per amenity; None where a listing has no amenities text.
wifi, kitchen, air_conditioning, pool = [], [], [], []

# Keyword searched in the amenities text, paired with the column it fills.
amenity_columns = (
    ("Wifi", wifi),
    ("Kitchen", kitchen),
    ("Air conditioning", air_conditioning),
    ("Pool", pool),
)

for x in ammenities:
    for keyword, column in amenity_columns:
        if not x:
            column.append(None)
        elif keyword in x:
            column.append(1)
        else:
            column.append(0)

Below you can find a DataFrame with the location and type of property per listing, with more detailed information on the amenities as well as other information that is important for the asset management company to consider: the number of bedrooms, guests and baths.

In [None]:
import pandas as pd

# Combine the per-listing columns into one table, one row per listing.
data = dict(
    location=location,
    type_of_property=type_of_property,
    rating=rating,
    reviews=reviews,
    price_per_night=price_per_night,
    guests=guests,
    bedrooms=bedrooms,
    baths=baths,
    wifi=wifi,
    kitchen=kitchen,
    air_conditioning=air_conditioning,
    pool=pool,
)
airbnb2 = pd.DataFrame(data)
airbnb2

Next, we will group all entries by rating and sort them in descending order and look at the first 5 entries that lead to the closest 5 star rating.

In [None]:
# Average the numeric columns per rating value (numeric_only=True skips the
# text columns, which newer pandas versions otherwise refuse to average),
# then put the highest ratings first.
airbnb2_groupby = airbnb2.groupby(by=["rating"]).mean(numeric_only=True)
airbnb2_groupby = airbnb2_groupby.sort_values(by=['rating'], ascending=False)
airbnb2_groupby.head()

This dataframe is the last result to consider which features and amenities to consider when investing in a property. 