In [1]:
# Optional dataset download using the Kaggle CLI: fetch the archive, extract it
# into download_dir, then remove the zip. Commented out; uncomment to run.
# download_dir="hotel-recommendations"

# !kaggle datasets download -d keshavramaiah/hotel-recommendation

# !unzip hotel-recommendation.zip -d $download_dir

# !rm hotel-recommendation.zip


In [2]:
import pandas as pd

In [3]:
# Directory that holds the three raw CSV files.
data_dir_path = "hotel-recommendations"

# Hotel-level details: keep the first row per hotelid and index by it so the
# table can be joined on a hotel code later.
details = pd.read_csv(f"{data_dir_path}/Hotel_details.csv")
details = details.drop_duplicates(subset="hotelid").set_index("hotelid")

# Room attributes and room prices are both indexed by their "id" column.
attributes = pd.read_csv(f"{data_dir_path}/Hotel_Room_attributes.csv", index_col="id")
price = pd.read_csv(
    f"{data_dir_path}/hotels_RoomPrice.csv",
    index_col="id",
)

In [4]:
# Build one flat table of room offers enriched with hotel-level details.
# Every step of the chain returns a new frame, so no column is ever assigned
# onto a slice of `price` (the pattern that triggers SettingWithCopyWarning).
price_columns = [
    "hotelcode",
    "roomtype",
    "onsiterate",
    "roomamenities",
    "maxoccupancy",
    "mealinclusiontype",
]
hotel_columns = ["hotelname", "city", "country", "starrating"]

data = (
    # Keep only the last price record for each refid.
    price.drop_duplicates(subset="refid", keep="last")[price_columns]
    # Look up the rate description in `attributes` by the row index of `price`
    # and store it under its final name, roomdescription.
    .assign(
        roomdescription=lambda df: attributes.loc[df.index, "ratedescription"]
    )
    # Attach hotel columns; hotelcode is matched against the index of `details`.
    .join(details[hotel_columns], on="hotelcode")
    # Meals count as included whenever a meal inclusion type is present.
    .assign(mealsincluded=lambda df: df["mealinclusiontype"].notna())
    # The lookup columns are not needed in the final table.
    .drop(columns=["hotelcode", "mealinclusiontype"])
    .reset_index(drop=True)
)

print(f"shape: {data.shape}")

shape: (11922, 10)


In [5]:
# Column names as they come out of the merge step.
list(data.columns)

['roomtype',
 'onsiterate',
 'roomamenities',
 'maxoccupancy',
 'roomdescription',
 'hotelname',
 'city',
 'country',
 'starrating',
 'mealsincluded']

In [6]:
# Map the run-together column names to snake_case.
# Columns missing from this mapping (city, country, roomdescription) keep
# their current names.
snake_case_names = {
    "roomtype": "room_type",
    "onsiterate": "onsite_rate",
    "roomamenities": "room_amenities",
    "maxoccupancy": "max_occupancy",
    "hotelname": "hotel_name",
    "starrating": "star_rating",
    "mealsincluded": "meals_included",
}
data = data.rename(snake_case_names, axis="columns")

In [7]:
# Column names after the snake_case rename.
list(data.columns)

['room_type',
 'onsite_rate',
 'room_amenities',
 'max_occupancy',
 'roomdescription',
 'hotel_name',
 'city',
 'country',
 'star_rating',
 'meals_included']

In [8]:
# For every column, print how many distinct non-null values it holds and the
# five most frequent ones (value_counts orders by descending frequency).
separator = "-" * 20
for column_name in data.columns:
    ranked_values = data[column_name].value_counts().index.tolist()
    distinct_count = len(ranked_values)
    print(f"col: {column_name}, unique_values_count: {distinct_count}")
    print(f"Examples: {ranked_values[:5]}")
    print(separator, "\n")


col: room_type, unique_values_count: 2635
Examples: ['Double Room', 'Family Room', 'Vacation Home', 'Triple Room', 'Suite']
-------------------- 

col: onsite_rate, unique_values_count: 4084
Examples: [0.0, 95.03, 89.75, 126.71, 90.42]
-------------------- 

col: room_amenities, unique_values_count: 6612
Examples: ['Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;Telephone: ;TV: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;In-room safe box: ;Shower: ;TV [flat screen]: ;', 'Air conditioning: ;Free Wi-Fi in all rooms!: ;Heating: ;In-room safe box: ;Laptop workspace: ;Shower: ;Towels: ;TV [flat screen]: ;', 'Air conditioning: ;Heating: ;In-room safe box: ;Internet access – wireless: ;Laptop workspace: ;Shower: ;Towels: ;TV [flat screen]: ;']
-------------------- 

col: max_occupancy, unique_values_count: 19
Examples: [2, 1, 4, 3, 6]
-------------------- 

col: roomdescription

In [9]:
# Tally duplicated rows: True marks a row identical to an earlier one.
is_duplicate_row = data.duplicated()
is_duplicate_row.value_counts()

False    10907
True      1015
Name: count, dtype: int64

In [10]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# Rebinding the name instead of using inplace=True keeps the step a plain
# expression that returns a new frame; the row labels are left as they are.
data = data.drop_duplicates()

In [11]:
# (rows, columns) of the table after duplicate rows were dropped.
data.shape

(10907, 10)

In [12]:
# Preview the first five rows of the cleaned table.
data.head()

Unnamed: 0,room_type,onsite_rate,room_amenities,max_occupancy,roomdescription,hotel_name,city,country,star_rating,meals_included
0,Vacation Home,636.09,Air conditioning: ;Closet: ;Fireplace: ;Free W...,4,"Shower, Kitchenette, 2 bedrooms, 1 double bed ...",Pantlleni,Beddgelert,United Kingdom,3,False
1,Vacation Home,591.74,Air conditioning: ;Closet: ;Dishwasher: ;Firep...,4,"Shower, Kitchenette, 2 bedrooms, 1 double bed ...",Willow Cottage,Beverley,United Kingdom,3,False
2,"Guest room, Queen or Twin/Single Bed(s)",0.0,,2,,AC Hotel Manchester Salford Quays,Manchester,United Kingdom,4,False
3,Bargemaster King Accessible Room,379.08,Air conditioning: ;Free Wi-Fi in all rooms!: ;...,2,Shower,"Lincoln Plaza London, Curio Collection by Hilton",London,United Kingdom,4,True
4,Twin Room,156.17,Additional toilet: ;Air conditioning: ;Blackou...,2,"Room size: 15 m²/161 ft², Non-smoking, Shower,...",Ibis London Canning Town,London,United Kingdom,3,True


In [13]:
# Persist the cleaned table; the row index is not written to the file.
cleaned_csv_path = "hotel-recommendations-cleaned.csv"
data.to_csv(cleaned_csv_path, index=False)

### Column metadata (to be used later in prompting)

In [None]:
import boto3, json
from langchain.llms.bedrock import Bedrock
from langchain_core.output_parsers import JsonOutputParser

In [27]:
# Ask the LLM for a JSON description (name, description, type) of each
# structured column. Requires the import cell directly above (boto3, json,
# Bedrock, JsonOutputParser) to have been run in this kernel.

# Only the structured columns are described; the free-text columns are dropped.
metadata_df = data.drop(
    columns=["room_type", "room_amenities", "roomdescription", "hotel_name"]
)

llm = Bedrock(
    model_id="anthropic.claude-v2:1",
    client=boto3.client("bedrock-runtime"),
    model_kwargs={"max_tokens_to_sample": 512, "temperature": 0.0},
)

prompt = """\n\nHuman: Below is a table with information about hotel rooms.
Return a JSON list with an entry for each column. Enclose the result in ```json ```.
No additional explanation. Each entry should have 
'{"name": "column name", "description": "good detailed, factually correct column description", "type": "column data type"}"""
# to_string() renders the sample rows in full, independent of pandas display
# options; str(df) may wrap or truncate the table depending on those options.
prompt += "\n\n" + metadata_df.head().to_string() + "\n\nAssistant: "

# invoke() is the Runnable entry point; predict() is deprecated in LangChain.
response = llm.invoke(prompt)

# The parser is given the raw model reply, which the prompt asks to be a
# ```json fenced block.
parser = JsonOutputParser()
attribute_info = parser.parse(response)
print(json.dumps(attribute_info, indent=2, default=str))

[
  {
    "name": "onsite_rate",
    "description": "The average daily rate for a room at this hotel",
    "type": "float"
  },
  {
    "name": "max_occupancy",
    "description": "The maximum number of guests allowed per room at this hotel",
    "type": "integer"
  },
  {
    "name": "city",
    "description": "The city where this hotel is located",
    "type": "string"
  },
  {
    "name": "country",
    "description": "The country where this hotel is located",
    "type": "string"
  },
  {
    "name": "star_rating",
    "description": "The star rating for this hotel, on a scale of 1 to 5 stars",
    "type": "integer"
  },
  {
    "name": "meals_included",
    "description": "Whether meals are included in the room rate at this hotel",
    "type": "boolean"
  }
]


NameError: name 'json' is not defined