In [1]:
import re
import pandas as pd
from io import StringIO

## Claude

In [14]:
# Load the saved Trip.com listing page (plain-text export) into memory as one string.
with open(
    'Siem Reap Hotels - Where to stay in Siem Reap _ Trip.txt',
    mode='r',
    encoding='utf-8',
) as listing_file:
    text = listing_file.read()

In [15]:
# Split the text into one chunk per hotel listing.
# Each listing starts with a "*" bullet followed by "hotel overview picture"
# and runs (lazily) up to the whitespace that precedes the next "*" bullet,
# or up to the trailing whitespace at the end of the text.
hotel_blocks = re.findall(r'\*\s+hotel overview picture[\s\S]+?(?=\s+\*|\s+$)', text)

# A dollar amount's digits, optionally with thousands separators ("65", "1,234").
_AMOUNT = r'\d+(?:,\d{3})*'


def _first_group(pattern, block):
    """Return the stripped first capture group of `pattern` in `block`, or None if no match."""
    match = re.search(pattern, block)
    return match.group(1).strip() if match else None


def _to_int(number_text):
    """Convert a digit string that may contain "," thousands separators to an int."""
    return int(number_text.replace(',', ''))


def _parse_hotel_block(block):
    """Extract the fields of one hotel listing into a dict.

    Parameters
    ----------
    block : str
        One element of `hotel_blocks` (the raw text of a single listing).

    Returns
    -------
    dict
        Always contains the same keys, in this order: name, rating,
        review_type, reviews, location, room_type, original_price, price,
        total_price, discount, free_cancellation, breakfast_included.
        A text/numeric field whose pattern does not match is None; the two
        flags are always booleans.
    """
    hotel = {}

    # Hotel name: the last run of text (no angle brackets / line breaks)
    # before the first "<https" link that follows the picture marker.
    # Always set the key so every record has the same shape.
    hotel['name'] = _first_group(
        r'hotel overview picture[\s\S]+?([^<>\r\n]+)\s+<https', block
    )

    # Score, written as "<decimal>/5".
    rating_text = _first_group(r'(\d+\.\d+)/5', block)
    hotel['rating'] = float(rating_text) if rating_text is not None else None

    # Textual label on the line after the score (e.g. "Very Good").
    hotel['review_type'] = _first_group(r'/5\s+\n\s+([A-Za-z ]+)\n', block)

    # Number of reviews, stored as an int like the other numeric fields.
    # Accepts the singular "1 review" as well as "N reviews".
    reviews_text = _first_group(r'(\d+,?\d*)\s+reviews?\b', block)
    hotel['reviews'] = _to_int(reviews_text) if reviews_text is not None else None

    # Location: the text between "Near" and "Show on Map".
    # NOTE(review): in the output saved with this notebook the `location`
    # column is empty for every displayed row, so this pattern may not match
    # the source text - confirm against the raw file.
    hotel['location'] = _first_group(r'Near\s+([^"]+?)Show on Map', block)

    # Room type: the text after "Show on Map", up to a run of two or more "/".
    hotel['room_type'] = _first_group(r'Show on Map\s+\n\s+([^/]+?)/{2,}', block)

    # Prices. Two consecutive dollar amounts mean "original price, then
    # discounted price"; otherwise fall back to a single amount that is not
    # the "Total (incl. taxes & fees)" figure.
    price_pair = re.search(r'\$(' + _AMOUNT + r')\s+\$(' + _AMOUNT + r')', block)
    if price_pair:
        hotel['original_price'] = _to_int(price_pair.group(1))
        hotel['price'] = _to_int(price_pair.group(2))
    else:
        single_price = re.search(
            r'(?<!Total \(incl\. taxes & fees\): )\$(' + _AMOUNT + r')\s+\n', block
        )
        hotel['original_price'] = None
        hotel['price'] = _to_int(single_price.group(1)) if single_price else None

    # Total price (including taxes and fees).
    total_text = _first_group(
        r'Total \(incl\. taxes & fees\): \$(' + _AMOUNT + r')', block
    )
    hotel['total_price'] = _to_int(total_text) if total_text is not None else None

    # Discount label, kept in its "<n>% Off" text form.
    discount_pct = _first_group(r'(\d+)% Off', block)
    hotel['discount'] = f"{discount_pct}% Off" if discount_pct is not None else None

    # Special labels: plain substring checks.
    hotel['free_cancellation'] = 'Free Cancellation' in block
    hotel['breakfast_included'] = 'Breakfast included' in block

    return hotel


# One record per listing, built once and handed to pandas in a single call.
hotels_data = [_parse_hotel_block(block) for block in hotel_blocks]

# Create a DataFrame
df = pd.DataFrame(hotels_data)
df


Unnamed: 0,name,rating,review_type,reviews,location,room_type,original_price,price,total_price,discount,free_cancellation,breakfast_included
0,Steung Siemreap Hotel,4.4,Very Good,55,,Deluxe Twin Room,65.0,44.0,49.0,30% Off,True,True
1,Neth Socheata Hotel,4.2,Very Good,29,,Deluxe Family Room,36.0,30.0,34.0,14% Off,True,False
2,The Atelier Hotel,4.1,Very Good,12,,Deluxe King Room With Cantilevered Balcony,57.0,34.0,38.0,37% Off,True,False
3,Siem Reap City Angkor Boutique,4.1,Very Good,50,,Superior Double Room,125.0,45.0,50.0,62% Off,True,False
4,Shadow Angkor Residence,4.5,Great,32,,Deluxe Room with River View,,32.0,39.0,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
315,SANNA VILLA Residence,4.7,Amazing,6,,One Bedroom Villa King Size Bed Balcony Pool View,76.0,64.0,71.0,15% Off,True,False
316,Starry Angkor Hotel,4.0,Very Good,6,,Superior Double Or Twin Room With City View,,27.0,30.0,,True,False
317,Elysium Suite,4.7,Amazing,108,,Cabana Room with Pool Access,,28.0,35.0,,True,True
318,Angkor Rithy Residence,4.0,Very Good,49,,Soupier Room With Hot Tub,35.0,23.0,26.0,33% Off,True,False


In [16]:
# Write the parsed hotel table to an Excel workbook, on a sheet named "Trip.com".
df.to_excel(
    excel_writer="hotels_Cambodia_Trip.xlsx",
    sheet_name='Trip.com',
)