# Bukit Vista Internship Project

In [29]:
import requests
import pandas as pd
import re
import numpy as np
import joblib

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Web Scrapping

Firstly, we want to scrap all of the data from the bukit vista website. The data consists of the hotel's name, the price's description, the number of bedroom, the number of bathroom, and extra description.

In [2]:
hotel, price, bedroom, bathroom, plus = [], [], [], [], []

for j in range(1,7):
    print(f'Proses iterasi ke {j}')
    if j == 1:
        url = 'https://www.bukitvista.com/search-results?location%5B0%5D&areas%5B0%5D&bedrooms'
    else:
        url = f'https://www.bukitvista.com/search-results/page/{j}?location%5B0%5D&areas%5B0%5D&bedrooms'

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    containers = soup.findAll('div', attrs = {'class': 'item-wrap item-wrap-v1 item-wrap-no-frame h-100'})

    for container in containers:
        hotel_name = container.find("h2", attrs={"class": "item-title"})
        hotel_price = container.find("li", attrs={"class": "item-price item-price-text"})
        bed = container.find("li", attrs={"class": "h-beds"})
        bath = container.find("li", attrs={"class": "h-baths"})
        hotel_plus = container.find("li", attrs={"class": "h-type"})

        hotel.append(hotel_name.text if hotel_name else None)
        price.append(hotel_price.text if hotel_price else None)
        bedroom.append(bed.text if bed else None)
        bathroom.append(bath.text if bath else None)
        plus.append(hotel_plus.text if hotel_plus else None)

Proses iterasi ke 1


  containers = soup.findAll('div', attrs = {'class': 'item-wrap item-wrap-v1 item-wrap-no-frame h-100'})


Proses iterasi ke 2
Proses iterasi ke 3
Proses iterasi ke 4
Proses iterasi ke 5
Proses iterasi ke 6


We notice that we need to extract the price, the currency, and the unit from the price description. Hence, we make the functions to extract all of them.

In [3]:
def extract_price(text):
  if text is not None:
    match = re.search(r'(USD|\$|Rp)\s?([\d.,]+)', text)
    if match:
        return int(match.group(2).replace('.', '').replace(',', ''))
  return None

def extract_currency(text):
  if text is not None:
    match = re.search(r'(USD|Rupiah|Rp)', text)
    if match:
        return match.group(0)
  return None

def extract_unit(text):
  if text is not None:
    match = re.search(r'(?:per|/)\s*(.+)', text, re.IGNORECASE)
    if match:
        return match.group(1)
  return None

Similarly, the other features will also extracted as follows.

In [4]:
hotel_name = [i.strip() for i in hotel]
price_list = [extract_price(i) for i in price]
currency = [extract_currency(i) for i in price]
units = [extract_unit(i) for i in price]
bedroom_list = [i.split()[1] for i in bedroom]
bathroom_list = [i.split()[1] for i in bathroom]

The obtained data, then, are combined into a dataframe and save it to the following variable.

In [5]:
hotel_df = pd.DataFrame({"Hotel": hotel_name, "Price": price_list, "Currency": currency,
                         "Unit": units, "Bedroom": bedroom_list, "Bathroom": bathroom_list,
                         "Plus": plus})
hotel_df

Unnamed: 0,Hotel,Price,Currency,Unit,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,258.0,,2 nights,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,USD,night,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,USD,Night,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,715.0,USD,2 nights,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,USD,night,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,USD,night,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,USD,night,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,USD,night,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,USD,night,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,USD,night,2,2.0,"Guest House, Villa"


# Data Preparation

In [6]:
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Hotel     51 non-null     object 
 1   Price     49 non-null     float64
 2   Currency  47 non-null     object 
 3   Unit      48 non-null     object 
 4   Bedroom   51 non-null     object 
 5   Bathroom  51 non-null     object 
 6   Plus      50 non-null     object 
dtypes: float64(1), object(6)
memory usage: 2.9+ KB


Now, we will do data cleaning. Since price is essential in this data, those null value in the price column will be deleted.

In [7]:
hotel_df.dropna(subset=['Price'], inplace=True)
hotel_df.reset_index(drop=True, inplace=True)
hotel_df

Unnamed: 0,Hotel,Price,Currency,Unit,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,258.0,,2 nights,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,USD,night,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,USD,Night,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,715.0,USD,2 nights,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,USD,night,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,USD,night,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,USD,night,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,USD,night,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,USD,night,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,USD,night,2,2.0,"Guest House, Villa"


Since the mode of the column unit is 'night' we fill the missing values of this column with that.

In [8]:
hotel_df['Unit'] = hotel_df['Unit'].fillna(hotel_df['Unit'].mode()[0])
hotel_df

Unnamed: 0,Hotel,Price,Currency,Unit,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,258.0,,2 nights,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,USD,night,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,USD,Night,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,715.0,USD,2 nights,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,USD,night,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,USD,night,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,USD,night,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,USD,night,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,USD,night,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,USD,night,2,2.0,"Guest House, Villa"


If we check the missing values in the column currency, we see that the price is compatible with USD instead of Rupiah because it is in tens not thousands. Thus, we fill this missing value with USD.

In [9]:
hotel_df['Currency'] = hotel_df['Currency'].fillna('USD')
hotel_df

Unnamed: 0,Hotel,Price,Currency,Unit,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,258.0,USD,2 nights,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,USD,night,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,USD,Night,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,715.0,USD,2 nights,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,USD,night,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,USD,night,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,USD,night,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,USD,night,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,USD,night,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,USD,night,2,2.0,"Guest House, Villa"


Next we want to uniformize the price into USD, so we divide the price by 16.380 if the currency is in Rupiah.

In [10]:
hotel_df['Price'] = hotel_df.apply(
    lambda row: round(row['Price'] / 16380, 2) if row['Currency'] == 'Rupiah' or row['Currency'] == 'Rp' else row['Price'],
    axis=1
)
hotel_df

Unnamed: 0,Hotel,Price,Currency,Unit,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,258.0,USD,2 nights,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,USD,night,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,USD,Night,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,715.0,USD,2 nights,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,USD,night,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,USD,night,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,USD,night,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,USD,night,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,USD,night,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,USD,night,2,2.0,"Guest House, Villa"


We also want to uniformize the price into per night price. Thus, we divide the price by 2 if the unit is 2 nights and 30 if the unit is month. Then, we drop the columns currency and unit.

In [11]:
def adjust_price(row):
    if '2 nights' in row['Unit']:
        return round(row['Price'] / 2, 2)
    elif 'Month' in row['Unit']:
        return round(row['Price'] / 30, 2)
    elif 'month' in row['Unit']:
        return round(row['Price'] / 30, 2)
    else:
        return row['Price']

hotel_df['Price'] = hotel_df.apply(adjust_price, axis=1)
hotel_df.drop(['Currency', 'Unit'], axis=1, inplace=True)
hotel_df

Unnamed: 0,Hotel,Price,Bedroom,Bathroom,Plus
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,129.0,2,2.0,"Amazing pool, Island life, Pool view, Surfing,..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,3,3.0,"Amazing pool, Island life, Pool view, Surfing,..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,4,4.0,"Amazing pool, Amazing View, Jungle View, Pool ..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,357.5,3,4.0,"Beachfront, Villa"
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,2,2.0,"Amazing pool, Golfing, Pool view, Villa"
5,Surfer’s Villa 4 Mins to Bingin & Dreamland Be...,118.0,2,2.5,"Pool view, Villa"
6,Private Pool Villa Minutes from Bingin Surf Ha...,165.0,2,2.0,"Pool view, Villa"
7,Ungasan Exquisite Villa w/ Rooftop & Private P...,167.0,3,3.5,"Amazing pool, Golfing, Surfing, Tropical, Villa"
8,Sun-Soaked Canggu Villa Perfect for Families,100.0,2,2.0,Villa
9,Sunny Exquisite Umalas Villa: 20 Minutes to Ca...,108.0,2,2.0,"Guest House, Villa"


Now, we separate the plus values into their own columns where 1 represents the existence of that extra and 0 otherwise. Afterwards, we can drop the original column.

In [12]:
hotel_df["Pool_View"] = hotel_df["Plus"].str.contains("Pool view", case=False).astype(int)
hotel_df["Villa"] = hotel_df["Plus"].str.contains("Villa", case=False).astype(int)
hotel_df["Amazing_Pool"] = hotel_df["Plus"].str.contains("Amazing pool", case=False).astype(int)
hotel_df["Golfing"] = hotel_df["Plus"].str.contains("Golfing", case=False).astype(int)
hotel_df["Surfing"] = hotel_df["Plus"].str.contains("Surfing", case=False).astype(int)
hotel_df["Tropical"] = hotel_df["Plus"].str.contains("Tropical", case=False).astype(int)
hotel_df["Guest_House"] = hotel_df["Plus"].str.contains("Guest House", case=False).astype(int)
hotel_df["Ocean_View"] = hotel_df["Plus"].str.contains("Ocean view", case=False).astype(int)
hotel_df["Beachfront"] = hotel_df["Plus"].str.contains("Beachfront", case=False).astype(int)
hotel_df["Tropical"] = hotel_df["Plus"].str.contains("Tropical", case=False).astype(int)
hotel_df["View"] = hotel_df["Plus"].str.contains("View", case=False).astype(int)
hotel_df["Residential"] = hotel_df["Plus"].str.contains("Residential", case=False).astype(int)
hotel_df["Jungle_View"] = hotel_df["Plus"].str.contains("Jungle View", case=False).astype(int)
hotel_df["Amazing_View"] = hotel_df["Plus"].str.contains("Amazing View", case=False).astype(int)
hotel_df["Island_Life"] = hotel_df["Plus"].str.contains("Island Life", case=False).astype(int)
hotel_df["Rice_Paddy_View"] = hotel_df["Plus"].str.contains("Rice paddy view", case=False).astype(int)
hotel_df["Style"] = hotel_df["Plus"].str.contains("Style", case=False).astype(int)
hotel_df = hotel_df.drop('Plus', axis=1)

hotel_df.head()

Unnamed: 0,Hotel,Price,Bedroom,Bathroom,Pool_View,Villa,Amazing_Pool,Golfing,Surfing,Tropical,Guest_House,Ocean_View,Beachfront,View,Residential,Jungle_View,Amazing_View,Island_Life,Rice_Paddy_View,Style
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,129.0,2,2,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0
1,Bingin Beach Hideaway: Group Villa with Pool &...,161.0,3,3,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,202.0,4,4,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,0
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,357.5,3,4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,84.0,2,2,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0


In [13]:
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Hotel            49 non-null     object 
 1   Price            49 non-null     float64
 2   Bedroom          49 non-null     object 
 3   Bathroom         49 non-null     object 
 4   Pool_View        49 non-null     int64  
 5   Villa            49 non-null     int64  
 6   Amazing_Pool     49 non-null     int64  
 7   Golfing          49 non-null     int64  
 8   Surfing          49 non-null     int64  
 9   Tropical         49 non-null     int64  
 10  Guest_House      49 non-null     int64  
 11  Ocean_View       49 non-null     int64  
 12  Beachfront       49 non-null     int64  
 13  View             49 non-null     int64  
 14  Residential      49 non-null     int64  
 15  Jungle_View      49 non-null     int64  
 16  Amazing_View     49 non-null     int64  
 17  Island_Life      4

In [14]:
hotel_df['Bedroom'] = hotel_df['Bedroom'].astype(float)
hotel_df['Bathroom'] = hotel_df['Bathroom'].astype(float)
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Hotel            49 non-null     object 
 1   Price            49 non-null     float64
 2   Bedroom          49 non-null     float64
 3   Bathroom         49 non-null     float64
 4   Pool_View        49 non-null     int64  
 5   Villa            49 non-null     int64  
 6   Amazing_Pool     49 non-null     int64  
 7   Golfing          49 non-null     int64  
 8   Surfing          49 non-null     int64  
 9   Tropical         49 non-null     int64  
 10  Guest_House      49 non-null     int64  
 11  Ocean_View       49 non-null     int64  
 12  Beachfront       49 non-null     int64  
 13  View             49 non-null     int64  
 14  Residential      49 non-null     int64  
 15  Jungle_View      49 non-null     int64  
 16  Amazing_View     49 non-null     int64  
 17  Island_Life      4

Since the data is limited, I attempt to scrap new data from traveloka.

In [15]:
url = 'https://www.traveloka.com/id-id/hotel/indonesia/region/bali-102746'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
card_info = soup.find_all('div', attrs = {'class': 'css-1dbjc4n r-14lw9ot r-awg2lu r-1dzdj1l r-rs99b7 r-1loqt21 r-18u37iz r-1udh08x r-1otgn73 r-1i6wzkk r-lrvibr'})
name, trave_points, jumlah_bintang, pengguna, harga = [], [], [], [], []

for card in card_info:
  hotel_name = card.find('a', attrs={'class':'css-4rbku5 r-13awgt0'})
  traveloka = card.find('div', attrs={'class':'css-901oao r-1gle5yg r-a5wbuh r-1b43r93 r-ovu0ai r-rjixqe r-fdjqy7'})
  stars = card.find_all('g', attrs = {'fill':'none'})
  customers = card.find('div', attrs = {'class':'css-901oao r-a5wbuh r-1b43r93 r-1kfrs79 r-rjixqe r-fdjqy7'})

  prices = card.find('div', attrs = {'class':'css-1dbjc4n r-obd0qt r-18afbma r-eqz5dr r-1wtj0ep r-ymttw5 r-1f1sjgu r-l0gwng'})
  price = prices.find('h2', attrs = {'class':'css-4rbku5 css-901oao r-1w9mtv9 r-1kfrs79'})


  name.append(hotel_name.text if hotel_name else None)
  trave_points.append(traveloka.text if traveloka else 0)
  jumlah_bintang.append(len(stars))
  pengguna.append(customers.text if customers else None)
  harga.append(price.text if price else None)

print(card_info)
print(name)
print(trave_points)
print(jumlah_bintang)
print(pengguna)
print(harga)

[]
[]
[]
[]
[]
[]


Unfortunately, the site traveloka has detected that I tried to scrape their data. I should have saved the data once I scraped it last time. Thus, currently, I cannot scrape the data anymore. I have checked other website such as tiket.com but they use infinite scrolling website so I cannot scrape their data. The other website, furthermore, such as travelio, airbnb, etc do not provide enough data to be scraped. Therefore, I decided to make a random data.

Nevertheless, this random data will follow the minimum and maximum possible price for the accomodation based on the existing data. I also tried to add weights to the number of bedroom and bathroom as well as the total features to make it more realistic.

In [16]:
hotel_df.describe()

Unnamed: 0,Price,Bedroom,Bathroom,Pool_View,Villa,Amazing_Pool,Golfing,Surfing,Tropical,Guest_House,Ocean_View,Beachfront,View,Residential,Jungle_View,Amazing_View,Island_Life,Rice_Paddy_View,Style
count,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0
mean,96.904694,2.102041,2.091837,0.122449,0.693878,0.102041,0.040816,0.061224,0.081633,0.306122,0.081633,0.061224,0.306122,0.183673,0.061224,0.020408,0.040816,0.020408,0.020408
std,69.152859,1.06546,1.029125,0.331201,0.465657,0.305839,0.199915,0.242226,0.276642,0.465657,0.276642,0.242226,0.465657,0.39123,0.242226,0.142857,0.199915,0.142857,0.142857
min,18.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,85.67,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,144.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,357.5,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
hotel_df = hotel_df.drop(['Hotel'], axis =1)

The random price will follow the following equation
$$
Price=BasePrice+(Bedroom \times W_1)+(Bathroom \times W_2)+(TotalFeatures \times W_3)+Noise
$$

In [18]:
num_samples = 2000

# Generate the random data of the number of bedroom and bathroom
bedroom = np.random.randint(1, 6, num_samples)  # 1-5 bedroom
bathroom = np.random.randint(1, 6, num_samples)  # 1-5 bathroom

binary_columns = [
    "Pool_View", "Villa", "Amazing_Pool", "Golfing", "Surfing", "Tropical", "Guest_House",
    "Ocean_View", "Beachfront", "View", "Residential", "Jungle_View",
    "Amazing_View", "Island_Life", "Rice_Paddy_View", "Style"
]

# Generate binary features (0 or 1)
binary_features = {col: np.random.randint(0, 2, num_samples) for col in binary_columns}

# Count the number of features
total_features = np.sum(list(binary_features.values()), axis=0)

# Creating the price randomly
base_price = 18  # the minimum price based on the actual data
price = (
    base_price +
    (bedroom * 25) +  # the more bedroom, the more expensive, so I assume the weight is 20 dollar
    (bathroom * 20) +  # I assume the weight for the number of bathroom is 15 dollar
    (total_features * 15) +  # the more features, the more expensive, I make the weight as 10 dollar
    np.random.randint(-10, 10, num_samples)  # some noises to make it more realistic
)

# Create the upper bound for the price
# the actual maximum price based on the data is 357.5 but I make some noises since it is possible to have all of the features
# that is, the number of features are 16 then 16*15 = 240, so 357.5+240 = 597.5 with some noises so it becomes 610
max_price = 610
price = np.minimum(price, max_price)

# Create the new DataFrame
data = {
    "Price": price,
    "Bedroom": bedroom,
    "Bathroom": bathroom,
    **binary_features
}

random_df = pd.DataFrame(data)
random_df.describe()

Unnamed: 0,Price,Bedroom,Bathroom,Pool_View,Villa,Amazing_Pool,Golfing,Surfing,Tropical,Guest_House,Ocean_View,Beachfront,View,Residential,Jungle_View,Amazing_View,Island_Life,Rice_Paddy_View,Style
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,272.7745,3.001,3.048,0.479,0.483,0.521,0.495,0.479,0.507,0.49,0.5075,0.494,0.492,0.5155,0.4975,0.489,0.502,0.4965,0.4975
std,55.326044,1.399992,1.416227,0.499684,0.499836,0.499684,0.5001,0.499684,0.500076,0.500025,0.500069,0.500089,0.500061,0.499885,0.500119,0.500004,0.500121,0.500113,0.500119
min,84.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,233.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,274.0,3.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,311.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,429.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The price seems reasonable as the maximum is not far from the actual data. Now, we combined the data.

In [19]:
combined_df = pd.concat([hotel_df, random_df], ignore_index=True)
combined_df.to_csv("final_data.csv", index=False)

# ML Training

In [21]:
final_df = pd.read_csv("https://drive.google.com/uc?id=1Z0jUl6Lq2sUgINQeKCZpNBgMy7H8h7Eo")
final_df.head()

Unnamed: 0,Price,Bedroom,Bathroom,Pool_View,Villa,Amazing_Pool,Golfing,Surfing,Tropical,Guest_House,Ocean_View,Beachfront,View,Residential,Jungle_View,Amazing_View,Island_Life,Rice_Paddy_View,Style
0,129.0,2.0,2.0,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0
1,161.0,3.0,3.0,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0
2,202.0,4.0,4.0,1,1,1,0,0,1,0,0,0,1,0,1,1,0,0,0
3,357.5,3.0,4.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,84.0,2.0,2.0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0


In [22]:
X = final_df.drop(['Price'], axis =1)
y = final_df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,Bedroom,Bathroom,Pool_View,Villa,Amazing_Pool,Golfing,Surfing,Tropical,Guest_House,Ocean_View,Beachfront,View,Residential,Jungle_View,Amazing_View,Island_Life,Rice_Paddy_View,Style
1856,3.0,2.0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1
570,3.0,5.0,1,0,0,1,1,1,0,1,0,0,0,1,0,0,1,1
926,1.0,3.0,1,1,1,1,0,1,1,1,1,0,0,0,1,0,1,1
670,5.0,3.0,1,0,0,1,1,0,1,1,1,0,1,0,0,1,1,0
1673,5.0,1.0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1


In [23]:
lr_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)
svr_model = SVR()

rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

svr_params = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}


In [24]:
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_train)
score = r2_score(y_train, y_pred)
print("Linear Regression Train Score:", score)

Linear Regression Train Score: 0.9531671007521788


In [25]:
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='r2')
rf_grid.fit(X_train, y_train)

rf_best_params = rf_grid.best_params_
rf_best_score = rf_grid.best_score_

print("Random Forest's Best Parameters:", rf_best_params)
print("Random Forest's Best Cross-validation Score:", rf_best_score)

rf_best = rf_grid.best_estimator_

Random Forest's Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest's Best Cross-validation Score: 0.8684489694101915


In [26]:
svr_grid = GridSearchCV(svr_model, svr_params, cv=5, scoring='r2')
svr_grid.fit(X_train, y_train)

svr_best_params = svr_grid.best_params_
svr_best_score = svr_grid.best_score_

print("Support Vector's Best Parameters:", svr_best_params)
print("Support Vector's Best Cross-validation Score:", svr_best_score)

svr_best = svr_grid.best_estimator_

Support Vector's Best Parameters: {'C': 100, 'epsilon': 0.2, 'kernel': 'linear'}
Support Vector's Best Cross-validation Score: 0.9515023930011839


# Model Evaluation

In [28]:
models = ['Linear Regression', 'Random Forest', 'Support Vector']
train_scores, test_scores = [], []

y_pred = lr_model.predict(X_train)
train_scores.append(r2_score(y_train, y_pred))
y_pred = lr_model.predict(X_test)
test_scores.append(r2_score(y_test, y_pred))

y_pred = rf_best.predict(X_train)
train_scores.append(r2_score(y_train, y_pred))
y_pred = rf_best.predict(X_test)
test_scores.append(r2_score(y_test, y_pred))

y_pred = svr_best.predict(X_train)
train_scores.append(r2_score(y_train, y_pred))
y_pred = svr_best.predict(X_test)
test_scores.append(r2_score(y_test, y_pred))

score_df = pd.DataFrame({'Model':models, 'Train Score':train_scores, 'Test Score':test_scores})
score_df

Unnamed: 0,Model,Train Score,Test Score
0,Linear Regression,0.953167,0.983753
1,Random Forest,0.978803,0.897975
2,Support Vector,0.951066,0.982187


Because linear regression has the best performance both in the train and test score then we select this model to be implemented using streamlit.

In [30]:
joblib.dump(lr_model, "linear_regression_model.pkl")

['linear_regression_model.pkl']