### Data Processing and Storage

#### 01 - Import dataset from RDS

In [1]:
import boto3
import pandas as pd
from rds import create_rds_engine, get_rds_instance
from env import (
    AWS_PROFILE_NAME,
    RDS_DB_NAME,
    RDS_INSTANCE_NAME,
    RDS_PASSWORD,
    RDS_USER,
    DEFAULT_RDS_DB_TABLE,
)

# Local aliases for the settings imported from the env module.
profile_name, db_instance_identifier = AWS_PROFILE_NAME, RDS_INSTANCE_NAME
master_username, master_password = RDS_USER, RDS_PASSWORD
db_name, default_table = RDS_DB_NAME, DEFAULT_RDS_DB_TABLE

# RDS client created from a boto3 session bound to the configured profile.
boto_session = boto3.Session(profile_name=profile_name)
rds_client = boto_session.client("rds")

# Look up the instance by its identifier, then build the engine from the
# result (presumably the instance endpoint and a SQLAlchemy engine --
# confirm against the rds helper module).
endpoint = get_rds_instance(rds_client, db_instance_identifier)
engine = create_rds_engine(
    master_username,
    master_password,
    endpoint,
    db_name,
)

# Load every row of the default table into a DataFrame.
query = f"SELECT * FROM `{default_table}`"
df = pd.read_sql(sql=query, con=engine)

RDS Instance founded


#### 02 - Add the required column label_avg_price_per_room to the dataset for data processing

In [2]:
def generate_avg(
    avg_price_per_room: float,
    *,
    low_max: float = 85,
    mid_max: float = 115,
) -> int:
    """Map an average room price to an ordinal price label (1, 2 or 3).

    Args:
        avg_price_per_room: Average price per room to classify.
        low_max: Highest price (inclusive) that still receives label 1.
        mid_max: Lowest price (inclusive) that receives label 3; prices
            strictly between ``low_max`` and ``mid_max`` receive label 2.

    Returns:
        1 if ``avg_price_per_room <= low_max``, 2 if it is greater than
        ``low_max`` and less than ``mid_max``, and 3 otherwise.
    """
    if avg_price_per_room <= low_max:
        return 1
    if avg_price_per_room < mid_max:
        return 2
    # Everything else lands here. That includes NaN, because NaN compares
    # False against both thresholds.
    return 3


# Compute the price label for every row with generate_avg, then store the
# result as a new column next to the raw price.
price_labels = df["avg_price_per_room"].apply(generate_avg)
df["label_avg_price_per_room"] = price_labels

#### 03 - Transform categorical variables into dummy variables

In [3]:
# Categorical columns to expand into one indicator column per category.
# The same names are passed as prefixes, so each new column is named
# "<column>_<category>" (default separator "_").
categorical_columns = [
    "type_of_meal_plan",
    "room_type_reserved",
    "market_segment_type",
]

df = pd.get_dummies(
    df,
    columns=categorical_columns,
    prefix=categorical_columns,
)

#### 04 - Drop columns after analyses

In [4]:
# Columns removed from the processed dataset. The raw avg_price_per_room is
# among them; the label_avg_price_per_room column derived from it is kept.
columns_to_drop = [
    "avg_price_per_room",
    "Booking_ID",
    "booking_status",
    "arrival_date",
    "lead_time",
    "no_of_weekend_nights",
    "no_of_week_nights",
    "required_car_parking_space",
    "repeated_guest",
    "no_of_previous_cancellations",
    "no_of_previous_bookings_not_canceled",
    "no_of_children",
]

# Drop in place so the existing DataFrame object bound to df is modified.
df.drop(columns=columns_to_drop, inplace=True)

#### 05 - Convert the dummy-encoded categorical variables to numeric variables

In [5]:
# Multiply every cell by 1: boolean values become integers (True -> 1,
# False -> 0) while numeric values are unchanged. Presumably the dummy
# columns are boolean at this point -- depends on the pandas version; confirm.
df = df.mul(1)

#### 06 - Create a new table in RDS and insert the processed dataset

In [6]:
# The processed data goes to a sibling table named after the source table.
processed_table = f"{default_table}-processed"

# NOTE(review): if_exists="append" adds rows to an already existing table, so
# re-running this cell inserts the dataset again -- confirm that is intended.
# index=False keeps the DataFrame index out of the table.
df.to_sql(name=processed_table, con=engine, if_exists="append", index=False)

36275