## Merging the 2 Datasets

### Imports and Data Loading

#### Imports

In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from pyspark.sql import SparkSession

#### Spark Initialization

In [3]:
# Clear Spark settings left behind by earlier runs so the session created
# afterwards starts from a known local configuration.
os.environ.pop("SPARK_MASTER", None)
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

# Stop a session that is still active from a previous run.
# getActiveSession() returns None when there is nothing to stop, so that case is
# reported explicitly. A genuine failure is reported with its cause instead of
# being mislabelled as "no session"; it is still not re-raised, so this cleanup
# stays best-effort.
try:
    stale_session = SparkSession.getActiveSession()
    if stale_session is None:
        print("No active Spark session found.")
    else:
        stale_session.stop()
except Exception as exc:
    print(f"Could not stop the existing Spark session: {exc!r}")

In [4]:
# Create (or reuse) a Spark session that runs in local mode on this machine,
# with the driver bound to the loopback address 127.0.0.1.
session_builder = SparkSession.builder.appName("HotelBookingDataCleaning")
session_builder = session_builder.master("local[*]")
session_builder = session_builder.config("spark.driver.host", "127.0.0.1")
session_builder = session_builder.config("spark.driver.bindAddress", "127.0.0.1")
spark = session_builder.getOrCreate()

#### Data Reading and Schema Viewing

In [6]:
# Load hotel-booking.csv and customer-reservations.csv as Spark DataFrames.
# Both files are read with a header row and with column types inferred by Spark.
csv_options = {"header": True, "inferSchema": True}
hotel_booking = spark.read.csv('datasets/hotel-booking.csv', **csv_options)
customer_reservations = spark.read.csv('datasets/customer-reservations.csv', **csv_options)

In [7]:
# Print the column names and inferred types of the hotel booking data.
hotel_booking.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- booking_status: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_year: integer (nullable = true)
 |-- arrival_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- market_segment_type: string (nullable = true)
 |-- country: string (nullable = true)
 |-- avg_price_per_room: double (nullable = true)
 |-- email: string (nullable = true)



In [8]:
# Print the column names and inferred types of the customer reservation data.
customer_reservations.printSchema()

root
 |-- Booking_ID: string (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_year: integer (nullable = true)
 |-- arrival_month: integer (nullable = true)
 |-- arrival_date: integer (nullable = true)
 |-- market_segment_type: string (nullable = true)
 |-- avg_price_per_room: double (nullable = true)
 |-- booking_status: string (nullable = true)



### Row Cleaning

In [9]:
# Convert both Spark DataFrames to pandas DataFrames; the cleaning, renaming and
# merging steps below all use the pandas API.
# NOTE(review): toPandas() brings every row onto the driver, so this assumes both
# datasets fit in local memory.
hotel_pd = hotel_booking.toPandas()
customer_pd = customer_reservations.toPandas()

In [11]:
# (rows, columns) of the customer frame.
print(customer_pd.shape)

(36275, 10)


In [12]:
# For each frame, remove rows that contain missing values, then remove rows that
# are exact duplicates of an earlier row.
hotel_pd = hotel_pd.dropna().drop_duplicates()
customer_pd = customer_pd.dropna().drop_duplicates()

In [41]:
# (rows, columns) of each frame at this point in the cleaning.
print(hotel_pd.shape)
print(customer_pd.shape)

(78297, 13)
(36275, 10)


### Column Cleaning

#### Schema

In [17]:
# Preview the first rows of the hotel frame.
hotel_pd.head()

Unnamed: 0,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,market_segment_type,country,avg_price_per_room,email
0,Resort Hotel,0,342,2015,July,27,1,0,0,Direct,PRT,0.0,Ernest.Barnes31@outlook.com
1,Resort Hotel,0,737,2015,July,27,1,0,0,Direct,PRT,0.0,Andrea_Baker94@aol.com
2,Resort Hotel,0,7,2015,July,27,1,0,1,Direct,GBR,75.0,Rebecca_Parker@comcast.net
3,Resort Hotel,0,13,2015,July,27,1,0,1,Corporate,GBR,75.0,Laura_M@gmail.com
4,Resort Hotel,0,14,2015,July,27,1,0,2,Online TA,GBR,98.0,LHines@verizon.com


In [18]:
# Preview the first rows of the customer frame.
customer_pd.head()

Unnamed: 0,Booking_ID,stays_in_weekend_nights,stays_in_week_nights,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,avg_price_per_room,booking_status
0,INN00001,1,2,224,2017,10,2,Offline,65.0,Not_Canceled
1,INN00002,2,3,5,2018,11,6,Online,106.68,Not_Canceled
2,INN00003,2,1,1,2018,2,28,Online,60.0,Canceled
3,INN00004,0,2,211,2018,5,20,Online,100.0,Canceled
4,INN00005,1,1,48,2018,4,11,Online,94.5,Canceled


#### Column Renaming and Editing

In [24]:
# Columns that both frames have in common once the day-of-month column has been
# renamed to "arrival_day". Indexing a frame with this list also fixes the
# column order.
selected_col_names = [
    # length of stay
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    # booking and arrival timing
    "lead_time",
    "arrival_year",
    "arrival_month",
    "arrival_day",
    # booking details
    "market_segment_type",
    "avg_price_per_room",
    "booking_status",
]

In [None]:
# The two frames name the day-of-month column differently; give it the common
# name "arrival_day" in both.
customer_pd = customer_pd.rename({"arrival_date": "arrival_day"}, axis="columns")
hotel_pd = hotel_pd.rename({"arrival_date_day_of_month": "arrival_day"}, axis="columns")

In [27]:
# Convert the month names in hotel_pd to month numbers (1-12), matching the
# integer arrival_month column that customer_pd already has.
month_to_num = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}
# Series.map() turns every value that is not a key of the dict into NaN. Once the
# column holds numbers, mapping it again would blank the whole column, so the
# conversion is skipped when the column is already numeric. This keeps the cell
# safe to re-run.
if not pd.api.types.is_numeric_dtype(hotel_pd["arrival_month"]):
    hotel_pd["arrival_month"] = hotel_pd["arrival_month"].map(month_to_num)

In [30]:
# Encode customer_pd's textual booking_status as integers:
# "Canceled" -> 0, "Not_Canceled" -> 1.
# NOTE(review): hotel_pd's booking_status is already an integer column; confirm
# it uses the same encoding (0 = canceled, 1 = not canceled), because both
# columns end up in a single booking_status column after the datasets are combined.
booking_map = {
    "Canceled": 0,
    "Not_Canceled": 1
}
# Series.map() turns every value that is not a key of the dict into NaN. Once the
# column holds 0/1, mapping it again would blank the whole column, so the
# conversion is skipped when the column is already numeric. This keeps the cell
# safe to re-run.
if not pd.api.types.is_numeric_dtype(customer_pd["booking_status"]):
    customer_pd["booking_status"] = customer_pd["booking_status"].map(booking_map)

In [31]:
# Preview only the shared columns of the hotel frame.
hotel_pd[selected_col_names].head()

Unnamed: 0,stays_in_weekend_nights,stays_in_week_nights,lead_time,arrival_year,arrival_month,arrival_day,market_segment_type,avg_price_per_room,booking_status
0,0,0,342,2015,7,1,Direct,0.0,0
1,0,0,737,2015,7,1,Direct,0.0,0
2,0,1,7,2015,7,1,Direct,75.0,0
3,0,1,13,2015,7,1,Corporate,75.0,0
4,0,2,14,2015,7,1,Online TA,98.0,0


In [32]:
# Preview only the shared columns of the customer frame.
customer_pd[selected_col_names].head()

Unnamed: 0,stays_in_weekend_nights,stays_in_week_nights,lead_time,arrival_year,arrival_month,arrival_day,market_segment_type,avg_price_per_room,booking_status
0,1,2,224,2017,10,2,Offline,65.0,1
1,2,3,5,2018,11,6,Online,106.68,1
2,2,1,1,2018,2,28,Online,60.0,0
3,0,2,211,2018,5,20,Online,100.0,0
4,1,1,48,2018,4,11,Online,94.5,0


### Dataset Merging

In [38]:
# Combine both frames into one frame that holds only the shared columns.
#
# The rows are stacked with pd.concat rather than joined: concat appends rows
# as they are, so the result always has len(hotel_pd) + len(customer_pd) rows.
# An outer merge keyed on every shared column would instead pair up rows whose
# values coincide across the two frames, collapsing or multiplying them.
merged_df = pd.concat(
    [hotel_pd[selected_col_names], customer_pd[selected_col_names]],
    ignore_index=True,
)

# Order the rows by the shared columns so the result does not depend on which
# frame a row came from, and renumber the index from 0.
merged_df = merged_df.sort_values(selected_col_names, ignore_index=True)

In [39]:
# Preview the first rows of the combined frame.
merged_df.head()

Unnamed: 0,stays_in_weekend_nights,stays_in_week_nights,lead_time,arrival_year,arrival_month,arrival_day,market_segment_type,avg_price_per_room,booking_status
0,0,0,0,2015,7,6,Direct,0.0,0
1,0,0,0,2015,7,16,Offline TA/TO,0.0,0
2,0,0,0,2015,7,20,Offline TA/TO,0.0,0
3,0,0,0,2015,7,20,Online TA,0.0,0
4,0,0,0,2015,7,28,Direct,0.0,0


In [40]:
# (rows, columns) of the combined frame.
merged_df.shape

(114572, 9)