In [1]:
import ast
import json
import os
import re
from pathlib import Path

import pandas as pd
from openai import OpenAI


In [2]:

# ========== 1. Load the Trip.com hotel search result text ==========
# Plain-text dump of the Trip.com results page, read in one go as UTF-8.
file_path = Path("Siem Reap Hotels - Where to stay in Siem Reap _ Trip.txt")
full_text = file_path.read_text(encoding="utf-8")

# ========== 2. Chunk the text using "Show on Map" as hotel delimiter ==========
# The zero-width lookahead splits *before* each marker, so every chunk after
# the first starts with "Show on Map". Each chunk is trimmed once, and chunks
# of 200 characters or fewer are discarded.
trimmed_chunks = (chunk.strip() for chunk in re.split(r"(?=Show on Map)", full_text))
hotel_blocks = [chunk for chunk in trimmed_chunks if len(chunk) > 200]

# ========== 3. Create a DataFrame for review ==========
# Single-column frame so the chunks can be inspected in the notebook.
df_blocks = pd.DataFrame({"Hotel Block": hotel_blocks})
df_blocks




Unnamed: 0,Hotel Block
0,<https://us.trip.com/?locale=en-US&curr=USD>\n...
1,Show on Map\n\nSearch Properties\n\n/\n\n/\nBu...
2,Show on Map\n *\n hotel overview picture\n...
3,Show on Map\n\n Deluxe Twin Room//////\n ...
4,Show on Map\n\n Deluxe Family Room////////\...
...,...
318,Show on Map\n\n One Bedroom Villa King Size...
319,Show on Map\n\n Superior Double Or Twin Roo...
320,Show on Map\n\n Cabana Room with Pool Acces...
321,Show on Map\n\n Soupier Room With Hot Tub//...


In [3]:
# Read the DeepSeek API key from the environment so the secret never lives in
# the notebook source, its saved outputs, or version control.
# Set it before starting the kernel, e.g. `export DEEPSEEK_API_KEY=...`.
# NOTE: any key that was previously hardcoded in this cell should be treated
# as compromised and rotated.
api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
if not api_key:
    # Fail here with an actionable message rather than later with an opaque
    # authentication error from the API client.
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set. Export it in the environment before running this notebook."
    )

In [4]:
# ========== 3. Prepare DeepSeek API ==========
# The OpenAI client class is pointed at DeepSeek's endpoint via `base_url`;
# the key comes from the `api_key` variable defined in the previous cell.
client = OpenAI(base_url="https://api.deepseek.com", api_key=api_key)

# System message sent with every hotel block: it describes the input, gives
# the JSON template the reply should follow, and lists the formatting rules
# (numeric fields as clean strings, "N/A" for missing values).
system_prompt = """\
You are processing a printed text block from a Trip.com hotel search result page. Each block describes one hotel. Extract clearly stated information only.

Return valid JSON structured like this:
{
  "hotel_name": "",
  "distance_miles": "",            # Only numeric, no 'mi' or location names
  "guest_rating_out_of_10": "",    # e.g. '8.6'
  "number_of_reviews": "",         # Numeric only
  "price_usd_total": "",           # Numeric only, no $ or 'total'
  "room_type_or_description": "",
  "amenities": ["", ""]
}

Strict rules:
- All numeric fields must be clean strings.
- Do NOT include units, currency symbols, or extra text.
- Use "N/A" if the value is not available.
- Output must be valid JSON (single object or list of objects)."""

# ========== 4. Send every hotel block to the DeepSeek API ==========
def _error_row():
    """Return a fresh placeholder record for a block whose reply could not be parsed."""
    return {
        "hotel_name": "ERROR",
        "distance_miles": "ERROR",
        "guest_rating_out_of_10": "ERROR",
        "number_of_reviews": "ERROR",
        "price_usd_total": "ERROR",
        "room_type_or_description": "ERROR",
        "amenities": ["ERROR"]
    }


def _parse_model_output(raw):
    """Convert one model reply into a list of hotel records.

    Parameters
    ----------
    raw : str
        Reply text, optionally wrapped in a Markdown code fence.

    Returns
    -------
    list of dict
        One dict per hotel; an empty list when the reply is an empty JSON array.

    Raises
    ------
    ValueError, SyntaxError
        When the text is neither valid JSON nor a Python literal, or when it
        parses to something other than a dict or a list of dicts.
    """
    # Peel off a surrounding code fence and its optional "json" language tag.
    cleaned = raw.strip("` \n")
    if cleaned.startswith("json"):
        cleaned = cleaned[4:].strip()

    try:
        # The prompt asks for JSON, so parse it as JSON first; this accepts
        # null/true/false, which a Python-literal parser rejects.
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to Python-literal parsing for replies that are not strict
        # JSON (e.g. single quotes, or '#' comments echoed from the template).
        parsed = ast.literal_eval(cleaned)

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, list):
        if not all(isinstance(item, dict) for item in parsed):
            raise ValueError("Parsed output list contains non-dict items")
        return parsed
    raise ValueError("Parsed output is not a dict or list")


results = []

for i, block in enumerate(hotel_blocks):
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": block}
        ],
        temperature=0.0,
        stream=False
    )

    # Guard against a reply with no text content (None) so one empty reply
    # becomes an ERROR row instead of aborting the whole run.
    raw = (response.choices[0].message.content or "").strip()
    print(f"\n🔹 Hotel Block {i} - Raw AI Output:\n{raw}\n")

    try:
        results.extend(_parse_model_output(raw))
    except Exception as e:
        # Best-effort by design: log the failure, keep a visible placeholder
        # row, and continue with the next block.
        print(f"❌ Parse error on block {i}: {e}")
        results.append(_error_row())

# ========== 5. Save to DataFrame ==========
# Build the frame once from the accumulated records.
df_trip_structured = pd.DataFrame(results)



🔹 Hotel Block 0 - Raw AI Output:
```json
{
  "hotel_name": "N/A",
  "distance_miles": "N/A",
  "guest_rating_out_of_10": "N/A",
  "number_of_reviews": "N/A",
  "price_usd_total": "N/A",
  "room_type_or_description": "N/A",
  "amenities": []
}
```


🔹 Hotel Block 1 - Raw AI Output:
```json
[]
```


🔹 Hotel Block 2 - Raw AI Output:
```json
{
  "hotel_name": "Steung Siemreap Hotel",
  "distance_miles": "0.066",
  "guest_rating_out_of_10": "8.8",
  "number_of_reviews": "55",
  "price_usd_total": "N/A",
  "room_type_or_description": "N/A",
  "amenities": []
}
```


🔹 Hotel Block 3 - Raw AI Output:
```json
{
  "hotel_name": "Neth Socheata Hotel",
  "distance_miles": "0.076",
  "guest_rating_out_of_10": "8.4",
  "number_of_reviews": "29",
  "price_usd_total": "49",
  "room_type_or_description": "Deluxe Twin Room",
  "amenities": ["Free Cancellation", "Breakfast included"]
}
```


🔹 Hotel Block 4 - Raw AI Output:
```json
{
  "hotel_name": "The Atelier Hotel",
  "distance_miles": "0.085",
  "g

In [5]:
# Keep only the rows whose price is something other than the literal "N/A",
# then renumber the index from zero. Rows carrying the "ERROR" placeholder
# are not affected by this filter.
has_price = df_trip_structured["price_usd_total"] != "N/A"
df_trip_structured = df_trip_structured.loc[has_price].reset_index(drop=True)
df_trip_structured

Unnamed: 0,hotel_name,distance_miles,guest_rating_out_of_10,number_of_reviews,price_usd_total,room_type_or_description,amenities
0,Neth Socheata Hotel,0.076,8.4,29,49,Deluxe Twin Room,"[Free Cancellation, Breakfast included]"
1,The Atelier Hotel,0.085,4.1,12,34,Deluxe Family Room,"[Free Cancellation, Special Discount]"
2,Siem Reap City Angkor Boutique,,4.1,50,38,Deluxe King Room With Cantilevered Balcony,[Free Cancellation]
3,Shadow Angkor Residence,0.104,9.0,32,50,Superior Double Room,[Free Cancellation]
4,Shalima Guesthouse,0.11,7.0,1,39,Deluxe Room with River View,[Free Cancellation]
...,...,...,...,...,...,...,...
314,Starry Angkor Hotel,1.1,4.0,6,71,One Bedroom Villa King Size Bed Balcony Pool View,"[Free Cancellation, Early Bird Deal]"
315,Elysium Suite,1.2,4.7,108,30,Superior Double Or Twin Room With City View,[Free Cancellation]
316,Angkor Rithy Residence,1.2,4.0,49,35,Cabana Room with Pool Access,"[Free Cancellation, Breakfast included]"
317,Saem Siemreap Hotel,1.2,4.7,116,26,Soupier Room With Hot Tub,"[Free Cancellation, First Booking Deal]"


In [6]:
# Write the filtered table to an Excel workbook in the working directory,
# leaving out the DataFrame index column.
df_trip_structured.to_excel(excel_writer='Trip.com_combodia.xlsx', index=False)