In [7]:
import ast
import json
import os
import re
from pathlib import Path

import pandas as pd
from openai import OpenAI

In [8]:
# Load the saved Booking.com search-results page (Rizal Park, Manila) as plain text.
file_path = Path("Booking.com_ Search results_ Rizal Park, Manila, Luzon, Philippines. Book your hotel now!.txt")
booking_text = file_path.read_text(encoding="utf-8")

# Every hotel card begins with a "Show on map <https://www.booking.com/hotel/..." link.
# Splitting on a zero-width lookahead keeps that marker at the head of each chunk.
HOTEL_MARKER = "Show on map <https://www.booking.com/hotel/"
MIN_BLOCK_CHARS = 300  # chunks at or below this length are treated as trivial
pattern = r"(?=Show on map <https://www\.booking\.com/hotel/)"
stripped_chunks = [chunk.strip() for chunk in re.split(pattern, booking_text)]

# Keep only real hotel chunks. Whatever precedes the first marker (page header,
# filters, ...) does not start with the marker, so it is dropped here explicitly
# instead of blindly discarding the first surviving row, which would throw away
# a real hotel whenever the preamble is short or missing.
hotel_blocks = [
    chunk
    for chunk in stripped_chunks
    if chunk.startswith(HOTEL_MARKER) and len(chunk) > MIN_BLOCK_CHARS
]

# Store in DataFrame (one row per hotel chunk, fresh 0..n-1 index)
df_blocks = pd.DataFrame({"Hotel Block": hotel_blocks})
df_blocks

Unnamed: 0,Hotel Block
0,Show on map <https://www.booking.com/hotel/ph/...
1,Show on map <https://www.booking.com/hotel/ph/...
2,Show on map <https://www.booking.com/hotel/ph/...
3,Show on map <https://www.booking.com/hotel/ph/...
4,Show on map <https://www.booking.com/hotel/ph/...
...,...
995,Show on map <https://www.booking.com/hotel/ph/...
996,Show on map <https://www.booking.com/hotel/ph/...
997,Show on map <https://www.booking.com/hotel/ph/...
998,Show on map <https://www.booking.com/hotel/ph/...


In [9]:
# Read the DeepSeek API key from the environment so the secret never lives in the notebook.
# NOTE: a key was previously stored on this line in plain text; treat it as leaked and revoke it.
api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
if not api_key:
    # Fail fast with a clear message rather than letting the first API call fail later.
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set. Define it in the environment before starting Jupyter."
    )

In [10]:
# System message sent with every hotel chunk: it fixes the JSON schema the model
# has to fill in and the formatting rules for each field.
system_prompt = "".join([
    # Task description
    "You are processing a printed text block from a Booking.com search result. ",
    "Each block contains the description of one hotel. Extract only the visible data. ",
    "Return the result as valid JSON (object or list of objects).\n\n",
    # Expected schema
    "Each hotel must contain:\n",
    "{\n",
    '  "hotel_name": "",\n',
    "  \"distance_miles\": \"\",            # Numeric only. No 'mi', no location names\n",
    "  \"guest_rating_out_of_10\": \"\",    # e.g. '8.6'\n",
    "  \"number_of_reviews\": \"\",         # e.g. '112'\n",
    "  \"price_usd_total\": \"\",           # e.g. '75'. No '$', no 'USD', no 'total'\n",
    '  "room_type_or_description": "",\n',
    '  "amenities": ["", ""]\n',
    "}\n\n",
    # Formatting rules
    "Strict rules:\n",
    "- All numeric fields must be clean numbers as strings.\n",
    "- Do NOT include units or extra words.\n",
    '- Use "N/A" if the value is missing.',
])

# OpenAI-compatible client pointed at the DeepSeek endpoint; uses the `api_key` defined earlier.
client = OpenAI(base_url="https://api.deepseek.com", api_key=api_key)



In [11]:
def _error_record():
    """Return a placeholder row marking a block whose reply could not be parsed."""
    return {
        "hotel_name": "ERROR",
        "distance_miles": "ERROR",
        "guest_rating_out_of_10": "ERROR",
        "number_of_reviews": "ERROR",
        "price_usd_total": "ERROR",
        "room_type_or_description": "ERROR",
        "amenities": ["ERROR"],
    }


def _parse_model_output(raw):
    """Convert the model's reply text into a list of hotel dicts.

    The reply is expected to be JSON, optionally wrapped in a Markdown code
    fence. JSON is parsed with ``json.loads`` so that ``null``/``true``/``false``
    are accepted; ``ast.literal_eval`` is kept only as a fallback for replies
    written as Python literals (e.g. single-quoted strings).

    Args:
        raw: Reply text returned by the model.

    Returns:
        A list of dicts: a single object is wrapped in a one-element list,
        a list of objects is returned as is.

    Raises:
        ValueError, SyntaxError (and whatever else ``ast.literal_eval`` raises)
        when the text cannot be parsed, or ValueError when the parsed value is
        not a dict or a list of dicts.
    """
    fenced = re.search(r"```[a-zA-Z]*\s*(.*?)```", raw, flags=re.DOTALL)
    if fenced:
        # Take only the fenced body so prose before/after the fence is ignored.
        cleaned = fenced.group(1).strip()
    else:
        # No complete fence: fall back to trimming stray backticks and a "json" tag.
        cleaned = raw.strip("` \n")
        if cleaned.startswith("json"):
            cleaned = cleaned[4:].strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        parsed = ast.literal_eval(cleaned)

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    raise ValueError("Parsed output is not a dict or list of dicts")


# Load hotel blocks
hotel_blocks = df_blocks["Hotel Block"].tolist()

results = []

for i, block in enumerate(hotel_blocks):
    # API errors are deliberately left to propagate: an invalid key or an outage
    # should stop the run instead of filling the table with placeholder rows.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": block}
        ],
        temperature=0.0,
        stream=False
    )

    # `content` may be None; treat that as an empty reply so it becomes an ERROR row
    # rather than an AttributeError that aborts the whole loop.
    raw = (response.choices[0].message.content or "").strip()
    print(f"\n🔹 Hotel Block {i} - Raw AI Output:\n{raw}\n")

    try:
        results.extend(_parse_model_output(raw))
    except Exception as e:
        # Best effort: keep going and leave a visible marker row for this block.
        print(f"❌ Parse error on block {i}: {e}")
        results.append(_error_record())

# Save to DataFrame
df_booking_structured = pd.DataFrame(results)
print("\n✅ Final structured DataFrame preview:")
print(df_booking_structured.head())



🔹 Hotel Block 0 - Raw AI Output:
```json
[
  {
    "hotel_name": "Comfy Inn Manila Kalaw by Reddoorz",
    "distance_miles": "0",
    "guest_rating_out_of_10": "8.0",
    "number_of_reviews": "1",
    "price_usd_total": "39",
    "room_type_or_description": "Twin Room",
    "amenities": ["2 twin beds"]
  },
  {
    "hotel_name": "Eton Baypark Manila Studio Unit By Tripleview Condominium",
    "distance_miles": "N/A",
    "guest_rating_out_of_10": "N/A",
    "number_of_reviews": "N/A",
    "price_usd_total": "N/A",
    "room_type_or_description": "N/A",
    "amenities": []
  }
]
```


🔹 Hotel Block 1 - Raw AI Output:
```json
{
  "hotel_name": "Eton Baypark Manila by Tripleview 2 Lovely Unit",
  "distance_miles": "0",
  "guest_rating_out_of_10": "6.8",
  "number_of_reviews": "7",
  "price_usd_total": "39",
  "room_type_or_description": "Studio Apartment with Queen Bed",
  "amenities": [
    "Entire apartment",
    "1 bedroom",
    "1 living room",
    "1 bathroom",
    "1 kitchen",
    

In [12]:
# Keep only the rows whose price is not the "N/A" placeholder, then renumber the index.
has_price = df_booking_structured["price_usd_total"].ne("N/A")
df_booking_structured = df_booking_structured.loc[has_price].reset_index(drop=True)
df_booking_structured

Unnamed: 0,hotel_name,distance_miles,guest_rating_out_of_10,number_of_reviews,price_usd_total,room_type_or_description,amenities
0,Comfy Inn Manila Kalaw by Reddoorz,0,8.0,1,39,Twin Room,[2 twin beds]
1,Eton Baypark Manila by Tripleview 2 Lovely Unit,0,6.8,7,39,Studio Apartment with Queen Bed,"[Entire apartment, 1 bedroom, 1 living room, 1..."
2,Eton Baypark,0,6.9,290,40,Studio Apartment,"[Entire apartment, 1 bedroom, 1 bathroom, 1 ki..."
3,Stunning Sunset Retreat at Eton Bayview II 2pax,0,,,79,One-Bedroom Apartment,"[2 beds (1 queen, 1 futon)]"
4,Luxury Condo in Manila Bay near US Embassy,,,,495,,[]
...,...,...,...,...,...,...,...
1049,Cozy Bavaria by Cozy Cabins PH,2.2,,,35,"Apartment with Pool View, 2 beds (1 full, 1 so...",[]
1050,Shore 1 Residence Pasay,2.2,,,47,Apartment with Pool View,"[2 beds (1 full, 1 sofa bed)]"
1051,Condotel Pasay City,2.2,10,1,52,Deluxe Double Room with Sea View,[1 full bed]
1052,Duxe Suite8- near MOA & NAIA Airport Wifi Netflix,2.2,,,58,"Suite with Balcony, Private suite • 1 bedroom ...",[Free cancellation]


In [14]:
# Write the filtered table to an Excel workbook, leaving out the DataFrame index column.
df_booking_structured.to_excel(
    "Booking.com_Analysis_Philippine_ai.xlsx",
    index=False,
)