## Claude

In [5]:
import re
import pandas as pd

def extract_hotel_info(text):
    """Parse the text of a Booking.com results page into a list of hotel dicts.

    The text is cut into blocks, each running up to and including the first
    "See availability" (or, failing that, "Price $<amount>") that follows an
    "Opens in new window" marker. Every block is then scanned with
    independent regular expressions, one per field.

    Args:
        text: Raw page text. An empty or ``None`` value yields an empty list.

    Returns:
        A list of dicts with the keys ``Name``, ``Rating`` (float), ``Price``
        (int), ``Reviews`` (int), ``Services`` (comma-separated string),
        ``Distance`` (e.g. ``"1.5 miles"``), ``Room Type`` and
        ``Remaining Rooms`` (int). A field that cannot be found holds the
        string ``'N/A'``. Blocks for which no name can be extracted are
        dropped.
    """
    if not text:
        return []

    # An integer with optional thousands separators ("98", "1,250"). Requiring
    # digits on both sides of each comma keeps stray punctuation out of the
    # capture, so int() never sees an empty or comma-only string.
    number = r'\d{1,3}(?:,\d{3})+|\d+'

    block_re = re.compile(
        r'(.*?Opens in new window.*?See availability'
        r'|.*?Opens in new window.*?Price \$(?:' + number + r'))',
        re.DOTALL,
    )
    # NOTE: the name class only accepts ASCII letters, digits, whitespace and
    # - ' , & so names containing other characters are captured partially.
    name_re = re.compile(r'([A-Za-z0-9\s\-\',&]+)\s+Opens in new window')
    price_re = re.compile(r'\$(' + number + r')(?:\$(' + number + r'))?')
    # Digits with an optional fractional part; a sentence-ending period after
    # the score ("Scored 8.5.") must not be captured or float() would fail.
    rating_re = re.compile(r'Scored\s+(\d+(?:\.\d+)?)')
    reviews_re = re.compile(r'(' + number + r')\s+reviews?\b')
    distance_re = re.compile(r'([\d\.]+)\s+miles\s+from\s+downtown')
    room_type_re = re.compile(
        r'([A-Za-z]+\s+[A-Za-z]+\s+Room|[A-Za-z]+\s+Suite|Studio)'
    )
    rooms_left_re = re.compile(r'Only\s+(\d+)\s+rooms?\s+left')
    service_checks = (
        (re.compile(r'[Ff]ree\s+cancellation'), 'Free cancellation'),
        (re.compile(r'[Bb]reakfast\s+included'), 'Breakfast included'),
        (re.compile(r'[Nn]o\s+prepayment'), 'No prepayment needed'),
    )

    hotels = []
    for block in block_re.findall(text):
        name_match = name_re.search(block)
        if not name_match:
            continue

        # The name class includes whitespace, so the capture can swallow
        # earlier lines of the block; only the line directly above the
        # "Opens in new window" marker is kept as the name.
        name_lines = [
            line.strip()
            for line in name_match.group(1).splitlines()
            if line.strip()
        ]
        if not name_lines:
            continue

        hotel = {
            'Name': name_lines[-1],
            'Rating': 'N/A',
            'Price': 'N/A',
            'Reviews': 'N/A',
            'Services': 'N/A',
            'Distance': 'N/A',
            'Room Type': 'N/A',
            'Remaining Rooms': 'N/A',
        }

        price_match = price_re.search(block)
        if price_match:
            # Two amounts glued together ("$120$98"): the second one is used
            # (presumably original price followed by the current price).
            amount = price_match.group(2) or price_match.group(1)
            hotel['Price'] = int(amount.replace(',', ''))

        rating_match = rating_re.search(block)
        if rating_match:
            hotel['Rating'] = float(rating_match.group(1))

        reviews_match = reviews_re.search(block)
        if reviews_match:
            hotel['Reviews'] = int(reviews_match.group(1).replace(',', ''))

        services = [
            label for pattern, label in service_checks if pattern.search(block)
        ]
        if services:
            hotel['Services'] = ', '.join(services)

        distance_match = distance_re.search(block)
        if distance_match:
            hotel['Distance'] = f"{distance_match.group(1)} miles"

        room_type_match = room_type_re.search(block)
        if room_type_match:
            hotel['Room Type'] = room_type_match.group(1)

        rooms_left_match = rooms_left_re.search(block)
        if rooms_left_match:
            hotel['Remaining Rooms'] = int(rooms_left_match.group(1))

        hotels.append(hotel)

    return hotels




In [7]:

# Columns written to the workbook, in display order.
EXPORT_COLUMNS = ['Name', 'Rating', 'Price', 'Reviews', 'Services',
                  'Distance', 'Room Type', 'Remaining Rooms']

# Load the saved text of the Booking.com results page for Siem Reap.
with open('Booking.com_ Hotels in Siem Reap. Book your hotel now!.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Parse the raw text into one dict per hotel.
hotels = extract_hotel_info(text)

print(f"Found {len(hotels)} hotels")

# Passing the columns at construction time keeps the export working even when
# no hotels were parsed (an empty frame would otherwise lack these columns and
# to_excel(columns=...) would fail).
df = pd.DataFrame(hotels, columns=EXPORT_COLUMNS)

# Siem Reap is in Cambodia, so the workbook is named accordingly; the earlier
# 'hotels_Indonesia_Booking.xlsx' name mislabeled this data.
df.to_excel('hotels_Cambodia_Booking.xlsx', sheet_name='Booking', index=False)

print("The hotel information has been successfully exported")

Found 520 hotels
The hotel information has been successfully exported


In [8]:
# Rebuild the frame from the parsed hotel records and write it to the
# Cambodia workbook as a single sheet named 'Booking', without the index.
df = pd.DataFrame(hotels)
df.to_excel(
    'hotels_Cambodia_Booking.xlsx',
    index=False,
    sheet_name='Booking',
    columns=[
        'Name',
        'Rating',
        'Price',
        'Reviews',
        'Services',
        'Distance',
        'Room Type',
        'Remaining Rooms',
    ],
)