In [1]:
import ast
import json
import os
import re

import pandas as pd
from openai import OpenAI

In [2]:
# Load the saved Expedia search-results page for Rizal Park, Manila as one string.
file_path = "Rizal Park, Manila, National Capital Region, Philippines Hotel Search Results.txt"
with open(file_path, mode="r", encoding="utf-8") as file:
    raw_text = file.read()

# For a quick look at the page, evaluate the slice below in a scratch cell:
# raw_text[:3000]


In [3]:
# Every listing on the page opens with a "Photo gallery for <name>" line, so that
# marker is used as the delimiter between hotels. The first piece returned by the
# split is the navigation/menu text that precedes the listings and is skipped;
# the marker is then put back in front of each remaining piece, which is also
# trimmed of surrounding whitespace.
hotel_blocks = [
    'Photo gallery for ' + listing.strip()
    for listing in re.split(r'\n\s*Photo gallery for ', raw_text)[1:]
]

# One row per hotel listing block.
df_blocks = pd.DataFrame(hotel_blocks, columns=["Hotel Block"])

df_blocks


Unnamed: 0,Hotel Block
0,Photo gallery for Maria Clara - Kalaw Ave · Fi...
1,Photo gallery for Ocean view studio room (1bed...
2,"Photo gallery for Manila Chic London Street, c..."
3,Photo gallery for City Garden Suites Manila\n\...
4,Photo gallery for Anex Hotel-Manila\n\nRecepti...
...,...
343,Photo gallery for Cozy Nook. Across Greenbelt5...
344,Photo gallery for 1 BR Fully Furnished Condo a...
345,Photo gallery for 1 BR Fully Furnished Condo i...
346,Photo gallery for The Sphere Serviced Residenc...


In [4]:
# df_blocks.to_csv('df_blocks.csv',index = False)

In [5]:
# SECURITY: credentials must not live in the notebook source. The key is read
# from the DEEPSEEK_API_KEY environment variable instead. The key that used to
# be hard-coded on this line should be treated as leaked and revoked.
api_key = os.environ.get("DEEPSEEK_API_KEY")
if not api_key:
    # Fail here with a clear message rather than later inside the API client.
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set. Define it in the environment before "
        "starting Jupyter (e.g. `export DEEPSEEK_API_KEY=...`)."
    )

In [7]:
hotel_blocks = df_blocks["Hotel Block"].tolist()

# The OpenAI SDK client is pointed at the DeepSeek endpoint through base_url.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com"
)

# Generalized, location-agnostic system prompt
system_prompt = (
    "You are processing a printed text block from an Expedia hotel search result page. "
    "Each block describes one or more hotels. Extract only visible hotel data, and return it as valid JSON.\n\n"
    "Each hotel must include:\n"
    "{\n"
    "  \"hotel_name\": \"\",\n"
    "  \"distance_miles\": \"\",            # Only the numeric value, e.g., \"0.4\". No units or locations.\n"
    "  \"guest_rating_out_of_10\": \"\",    # Only the numeric value, e.g., \"8.6\"\n"
    "  \"number_of_reviews\": \"\",         # Numeric only, e.g., \"1178\"\n"
    "  \"price_usd_total\": \"\",           # Only the number, e.g., \"85\". No '$', 'USD', or 'total'.\n"
    "  \"room_type_or_description\": \"\",\n"
    "  \"amenities\": [\"\", \"\"]          # List of amenities that are explicitly mentioned\n"
    "}\n\n"
    "Strict rules:\n"
    "- Do NOT include units, symbols, or extra words in numeric fields.\n"
    "- Do NOT guess. Use \"N/A\" if a field is not available.\n"
    "- Output must be a JSON object or list of objects. No markdown, code blocks, or comments."
)

# Matches a Markdown code fence (optionally tagged "json") and captures its body.
# The model wraps its replies in such fences even though the prompt forbids it.
_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _error_row():
    """Return a fresh placeholder record for a block whose reply could not be parsed.

    The keys mirror the schema requested in ``system_prompt`` so that error rows
    land in the same DataFrame columns as successfully parsed rows. A new dict
    is built on every call so rows never share the mutable ``amenities`` list.
    """
    return {
        "hotel_name": "ERROR",
        "distance_miles": "ERROR",
        "guest_rating_out_of_10": "ERROR",
        "number_of_reviews": "ERROR",
        "price_usd_total": "ERROR",
        "room_type_or_description": "ERROR",
        "amenities": ["ERROR"]
    }


def _parse_hotel_records(raw_output):
    """Turn one model reply into a list of hotel dicts.

    Args:
        raw_output: The text returned by the model for a single hotel block.

    Returns:
        A list of dicts: one element when the reply is a single JSON object,
        or every element when the reply is a JSON list of objects.

    Raises:
        ValueError: If the reply cannot be parsed, or if it parses to something
            other than a dict or a list of dicts.
    """
    fenced = _FENCE_RE.search(raw_output)
    if fenced:
        cleaned = fenced.group(1)
    else:
        # No complete fence: fall back to trimming stray backticks and a
        # leading "json" tag from the ends of the reply.
        cleaned = raw_output.strip("` \n")
        if cleaned.startswith("json"):
            cleaned = cleaned[4:].strip()

    try:
        # The prompt asks for JSON, so parse it as JSON; unlike
        # ast.literal_eval this accepts null / true / false.
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as json_err:
        try:
            # Tolerate Python-literal style replies (e.g. single-quoted keys).
            # literal_eval only evaluates literals, it does not execute code.
            parsed = ast.literal_eval(cleaned)
        except (ValueError, SyntaxError, TypeError):
            raise ValueError(f"Reply is not valid JSON: {json_err}") from json_err

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    raise ValueError("Parsed content is not a dict or list")


results = []

for i, block in enumerate(hotel_blocks):
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": block}
        ],
        temperature=0.0,
        stream=False
    )

    # Stored under its own name so the page text held in `raw_text` (loaded
    # earlier in the notebook) is not overwritten. `content` may be None, in
    # which case an empty reply is recorded as a parse error below.
    raw_output = (response.choices[0].message.content or "").strip()
    print(f"\n🔹 Hotel Block {i} - Raw AI Output:\n{raw_output}\n")

    try:
        results.extend(_parse_hotel_records(raw_output))
    except ValueError as e:
        # Keep going: record a placeholder so the failed block stays visible.
        print(f"❌ Parse error on block {i}: {e}")
        results.append(_error_row())

# Save to DataFrame
df_structured = pd.DataFrame(results)
print("\n✅ Final structured hotel DataFrame preview:\n")
print(df_structured.head())



🔹 Hotel Block 0 - Raw AI Output:
```json
{
  "hotel_name": "Maria Clara - Kalaw Ave · Filipino Themed Luxury 2BR W/Free Airport Pickup",
  "distance_miles": "0.15",
  "guest_rating_out_of_10": "9.8",
  "number_of_reviews": "60",
  "price_usd_total": "185",
  "room_type_or_description": "Entire condo, Sleeps 6, 2 bedrooms, 1 bathroom",
  "amenities": ["Pool", "Kitchen"]
}
```


🔹 Hotel Block 1 - Raw AI Output:
```json
{
  "hotel_name": "Ocean view studio room (1bed)",
  "distance_miles": "0.27",
  "guest_rating_out_of_10": "2.0",
  "number_of_reviews": "1",
  "price_usd_total": "87",
  "room_type_or_description": "Entire apartment, Sleeps 2, 1 bedroom, 1 bathroom",
  "amenities": ["Pool", "Kitchen"]
}
```


🔹 Hotel Block 2 - Raw AI Output:
```json
{
  "hotel_name": "Manila Chic London Street, coffee bar Oceanview",
  "distance_miles": "0.27",
  "guest_rating_out_of_10": "8.0",
  "number_of_reviews": "1",
  "price_usd_total": "130",
  "room_type_or_description": "Entire condo, Sleeps 4,

In [8]:
# Drop rows that have no usable price. Besides real NaN values (the key was
# absent from a reply), the extraction step stores two sentinel strings in this
# column: "N/A" (the system prompt's marker for an unavailable field) and
# "ERROR" (the placeholder for an unparseable reply). dropna() alone would keep
# both, so they are filtered explicitly.
price_missing = (
    df_structured['price_usd_total'].isna()
    | df_structured['price_usd_total'].isin(["N/A", "ERROR"])
)
df_structured = df_structured[~price_missing]
df_structured

Unnamed: 0,hotel_name,distance_miles,guest_rating_out_of_10,number_of_reviews,price_usd_total,room_type_or_description,amenities
0,Maria Clara - Kalaw Ave · Filipino Themed Luxu...,0.15,9.8,60,185,"Entire condo, Sleeps 6, 2 bedrooms, 1 bathroom","[Pool, Kitchen]"
1,Ocean view studio room (1bed),0.27,2.0,1,87,"Entire apartment, Sleeps 2, 1 bedroom, 1 bathroom","[Pool, Kitchen]"
2,"Manila Chic London Street, coffee bar Oceanview",0.27,8.0,1,130,"Entire condo, Sleeps 4, 1 bedroom, 1 bathroom",[Private kitchen]
3,City Garden Suites Manila,0.29,8.4,1001,55,,[]
4,Anex Hotel-Manila,0.3,7.0,616,38,,[]
...,...,...,...,...,...,...,...
345,Cozy Nook. Across Greenbelt5. WiFi.Pool.Netflix,3.28,6.0,1,81,"Entire condo, Sleeps 4, 1 bedroom, 1 bathroom","[Pool, Kitchen, WiFi, Netflix]"
346,1 BR Fully Furnished Condo across MOA with Poo...,3.29,8.6,20,64,"Entire condo, Sleeps 4, 1 bedroom, 1 bathroom","[Pool, Kitchen]"
347,1 BR Fully Furnished Condo in Makati with Pool...,3.29,9.6,4,64,"Entire condo, Sleeps 4, 1 bedroom, 1 bathroom","[Pool, Kitchen]"
348,The Sphere Serviced Residences Managed by HII,3.31,9.0,376,63,,[Pool]


In [9]:
# Export the structured hotel table to an Excel workbook, omitting the index column.
df_structured.to_excel(excel_writer='Philippines_expedia_ai.xlsx', index=False)