In [2]:
# download_dir="hotel-recommendations"

# !kaggle datasets download -d keshavramaiah/hotel-recommendation

# !unzip hotel-recommendation.zip -d $download_dir

# !rm hotel-recommendation.zip


hotel-recommendation.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  hotel-recommendation.zip
replace hotel-recommendations/Hotel_Room_attributes.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [5]:
import pandas as pd

In [6]:
data_dir_path = "hotel-recommendations"

details = (
    pd.read_csv(f"{data_dir_path}/Hotel_details.csv")
    .drop_duplicates(subset="hotelid")
    .set_index("hotelid")
)
attributes = pd.read_csv(
    f"{data_dir_path}/Hotel_Room_attributes.csv", index_col="id"
)
price = pd.read_csv(f"{data_dir_path}/hotels_RoomPrice.csv", index_col="id")

In [7]:
data = price.drop_duplicates(subset="refid", keep="last")[
    [
        "hotelcode",
        "roomtype",
        "onsiterate",
        "roomamenities",
        "maxoccupancy",
        "mealinclusiontype",
    ]
]

data["ratedescription"] = attributes.loc[data.index]["ratedescription"]
data = data.join(
    details[["hotelname", "city", "country", "starrating"]], on="hotelcode"
)
data = data.rename({"ratedescription": "roomdescription"}, axis=1)
data["mealsincluded"] = ~data["mealinclusiontype"].isnull()
data.pop("hotelcode")
data.pop("mealinclusiontype")
data = data.reset_index(drop=True)

print(f"shape: {data.shape}")

shape: (11922, 10)


In [8]:
data.columns.tolist()

['roomtype',
 'onsiterate',
 'roomamenities',
 'maxoccupancy',
 'roomdescription',
 'hotelname',
 'city',
 'country',
 'starrating',
 'mealsincluded']

In [9]:
data = data.rename(columns={
    "roomtype": "room_type",
    "onsiterate": "onsite_rate",
    "roomamenities": "room_amenities",
    "maxoccupancy": "max_occupancy",
    "hotelname": "hotel_name",
    "starrating": "star_rating",
    "mealsincluded": "meals_included"
})

In [10]:
data.columns.tolist()

['room_type',
 'onsite_rate',
 'room_amenities',
 'max_occupancy',
 'roomdescription',
 'hotel_name',
 'city',
 'country',
 'star_rating',
 'meals_included']

In [11]:
for col in data.columns:
    values = data[col].value_counts().index.tolist()
    print(f"col: {col}, unique_values_count: {len(values)}")
    print(f"Examples: {values[:5]}")
    print("-"*20, "\n")


col: room_type, unique_values_count: 2635
Examples: ['Double Room', 'Family Room', 'Vacation Home', 'Triple Room', 'Suite']
-------------------- 

col: onsite_rate, unique_values_count: 4084
Examples: [0.0, 95.03, 89.75, 126.71, 90.42]
-------------------- 

col: room_amenities, unique_values_count: 6612
Examples: ['Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;Telephone: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;TV [flat screen]: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;Heating: ;In-room safe box: ;Laptop workspace: ;Shower: ;Towels: ;TV [flat screen]: ;', 'Air conditioning: ;Heating: ;In-room safe box: ;Internet access – wireless: ;Laptop workspace: ;Shower: ;Towels: ;TV [flat screen]: ;']
-------------------- 

col: max_occupancy, unique_values_count: 19
Examples: [2, 1, 4, 3, 6]
-------------------- 

col: roomdescription

In [12]:
data.duplicated().value_counts()

False    10907
True      1015
Name: count, dtype: int64

In [13]:
data.drop_duplicates(inplace=True)

In [14]:
data.shape

(10907, 10)

In [15]:
data.head()

Unnamed: 0,room_type,onsite_rate,room_amenities,max_occupancy,roomdescription,hotel_name,city,country,star_rating,meals_included
0,Vacation Home,636.09,Air conditioning: ;Closet: ;Fireplace: ;Free W...,4,"Shower, Kitchenette, 2 bedrooms, 1 double bed ...",Pantlleni,Beddgelert,United Kingdom,3,False
1,Vacation Home,591.74,Air conditioning: ;Closet: ;Dishwasher: ;Firep...,4,"Shower, Kitchenette, 2 bedrooms, 1 double bed ...",Willow Cottage,Beverley,United Kingdom,3,False
2,"Guest room, Queen or Twin/Single Bed(s)",0.0,,2,,AC Hotel Manchester Salford Quays,Manchester,United Kingdom,4,False
3,Bargemaster King Accessible Room,379.08,Air conditioning: ;Free Wi-Fi in all rooms!: ;...,2,Shower,"Lincoln Plaza London, Curio Collection by Hilton",London,United Kingdom,4,True
4,Twin Room,156.17,Additional toilet: ;Air conditioning: ;Blackou...,2,"Room size: 15 m²/161 ft², Non-smoking, Shower,...",Ibis London Canning Town,London,United Kingdom,3,True


In [16]:
data.to_csv("hotel-recommendations-cleaned.csv", index=False)

### below to be used in prompting

In [17]:
import json, re
from pandas.core.common import flatten

In [18]:
item_desc_format = """Name: `{col_name}`
Description: 
Type: {col_type}
Distinct: {distinct_values_list}
"""

print(item_desc_format)

Name: `{col_name}`
Description: 
Type: {col_type}
Distinct: {distinct_values_list}



In [19]:
# data["room_type"].dtype

In [20]:
col_description_list = []
for col in data.columns:
    values = data[col].value_counts().index.tolist()
    text = item_desc_format.format(
        col_name = col,
        col_type = data[col].dtype,
        distinct_values_list = values[:20], 
    )
    col_description_list.append(text)
    
for item in col_description_list:
    print(item)
# print(json.dumps(col_description_list, indent=2, default=str))

Name: `room_type`
Description: 
Type: object
Distinct: ['Double Room', 'Family Room', 'Vacation Home', 'Triple Room', 'Suite', 'Twin Room', 'Quadruple Room', 'Superior Double Room', 'Junior Suite', 'Double or Twin Room', 'Deluxe Double Room', 'Standard Double Room', 'Single Room', 'Family Room (2 Adults + 2 Children)', 'Apartment', 'Triple', 'Comfort Double Room', 'Family Suite', 'Standard Twin Room', 'Single']

Name: `onsite_rate`
Description: 
Type: float64
Distinct: [0.0, 95.03, 126.71, 89.75, 90.42, 147.83, 84.48, 79.19, 100.32, 80.37, 116.16, 110.51, 158.39, 137.27, 105.59, 131.99, 104.54, 142.55, 75.35, 85.39]

Name: `room_amenities`
Description: 
Type: object
Distinct: ['Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;Telephone: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;Heating: ;In-room safe box: ;Laptop workspace: ;Shower: ;Towels: ;TV [flat screen]: ;', '

In [21]:
data["room_amenities"].notna

# sorted(metadata_df["material"].unique().tolist())
room_amenities = data["room_amenities"].value_counts().index.tolist()
room_amenities = [re.split(r"[:;]", s) for s in room_amenities]
room_amenities = list(set(flatten(room_amenities)))
room_amenities = sorted(list(set([s.strip() for s in room_amenities])))
print(room_amenities[:21])

['', 'Additional bathroom', 'Additional toilet', 'Air conditioning', 'Air purifier', 'Alarm clock', 'Bathrobes', 'Bathroom phone', 'Blackout curtains', 'Carbon monoxide detector', 'Carpeting', 'Cleaning products', 'Closet', 'Clothes dryer', 'Clothes rack', 'Coffee/tea maker', 'Complimentary tea', 'DVD/CD player', 'Daily housekeeping', 'Daily newspaper', 'Dart board']


In [22]:
data["roomdescription"].notna

# sorted(data["roomdescription"].unique().tolist())
roomdescription = data["roomdescription"].value_counts().index.tolist()
roomdescription = [re.split(r"[,&]", s) for s in roomdescription]
roomdescription = list(set(flatten(roomdescription)))
roomdescription = [s.split("or") for s in roomdescription]
roomdescription = list(set(flatten(roomdescription)))
roomdescription = [s.split("and") for s in roomdescription]
roomdescription = list(set(flatten(roomdescription)))
roomdescription = sorted(list(set([s.strip() for s in roomdescription])))
# roomdescription
print(roomdescription[:21])

['1', '1 bunk bed', '1 double bed', '1 futon', '1 king bed', '1 queen bed', '1 semi double bed', '1 single bed', '1 sofa bed', '1 super king bed', '10 bathrooms', '10 bunk beds', '10 queen beds', '12 double beds', '17 king beds', '2 bathrooms', '2 bedrooms', '2 bunk beds', '2 double beds', '2 futons', '2 king beds']
