In [6]:
import re
import pandas as pd
from collections import OrderedDict

# Full extraction over the complete (large) sample

In [7]:
def extract_hotel_info(file_content):
    """Parse saved hotel search-result text into one record per hotel.

    A record starts at every line that begins with "Photo gallery for"; the
    rest of that line is taken as the hotel name. Every following non-blank
    line is scanned for a rating, a review count, prices, a breakfast flag
    and facility keywords until the next header line. Lines that appear
    before the first header are ignored.

    Args:
        file_content: The whole results text as one string. An empty or
            ``None`` value yields an empty list.

    Returns:
        A list of ``OrderedDict`` records with the keys ``'name'``,
        ``'rating'`` (numeric string or ``None``), ``'reviews'`` (digit
        string with thousands separators removed, or ``None``),
        ``'nightly_price'`` and ``'total_price'`` (``float`` or ``None``),
        ``'breakfast'`` (``bool``) and ``'facilities'`` (a ``'; '``-joined
        string, or ``'N/A'`` when no facility keyword was seen).
    """
    if not file_content:
        return []

    header_prefix = "Photo gallery for"

    # A tuple rather than a set: set iteration order for strings changes
    # between interpreter runs, which made the joined 'facilities' string
    # non-reproducible.
    facility_keywords = (
        'Pool', 'Kitchen', 'Washer and dryer', 'Hot tub',
        'Spa', 'Gym', 'Restaurant', 'Bar',
    )
    # Whole-word match (optional plural "s") so that "Spa" does not fire on
    # "Spacious" and "Bar" does not fire on "barbecue".
    facility_patterns = [
        (facility, re.compile(r'\b' + re.escape(facility) + r's?\b', re.IGNORECASE))
        for facility in facility_keywords
    ]

    rating_re = re.compile(r'(\d+\.?\d*)/10|(\d+\.?\d*)\s+out of 10')
    # Prefer the number that directly precedes the word "reviews" ...
    reviews_re = re.compile(r'(\d[\d,]*)\s*reviews', re.IGNORECASE)
    # ... and only fall back to the first number on the line otherwise.
    first_number_re = re.compile(r'(\d[\d,]*)')
    # Requires a leading digit so float() never receives an empty string;
    # an optional decimal part is accepted as well.
    price_re = re.compile(r'\$(\d[\d,]*(?:\.\d+)?)\s+(nightly|total)')

    hotels = []
    current_hotel = None

    for raw_line in file_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(header_prefix):
            # A new header closes the record that was being filled.
            if current_hotel is not None:
                hotels.append(current_hotel)
            current_hotel = OrderedDict([
                ('name', line.replace("Photo gallery for", "").strip()),
                ('rating', None),
                ('reviews', None),
                ('nightly_price', None),
                ('total_price', None),
                ('breakfast', False),
                ('facilities', [])
            ])
            continue

        if current_hotel is None:
            # Text before the first header belongs to no hotel.
            continue

        lowered = line.lower()

        rating_match = rating_re.search(line)
        if rating_match:
            current_hotel['rating'] = rating_match.group(1) or rating_match.group(2)

        if 'reviews' in lowered:
            reviews_match = reviews_re.search(line) or first_number_re.search(line)
            if reviews_match:
                current_hotel['reviews'] = reviews_match.group(1).replace(',', '')

        price_match = price_re.search(line)
        if price_match:
            price_key = f"{price_match.group(2)}_price"
            current_hotel[price_key] = float(price_match.group(1).replace(',', ''))

        if 'breakfast included' in lowered:
            current_hotel['breakfast'] = True

        found = current_hotel['facilities']
        for facility, pattern in facility_patterns:
            if facility not in found and pattern.search(line):
                found.append(facility)

    # The last record has no following header to close it.
    if current_hotel is not None:
        hotels.append(current_hotel)

    for hotel in hotels:
        hotel['facilities'] = '; '.join(hotel['facilities']) if hotel['facilities'] else 'N/A'

    return hotels



In [None]:
# Example run: read a saved search-results text file, extract the hotel
# records and show them as a table.
with open(
    'Siem Reap, Siem Reap, Cambodia Hotel Search Results.txt',
    mode='r',
    encoding='utf-8',
) as f:
    file_content = f.read()

hotels = extract_hotel_info(file_content)

# Missing values are shown as 'N/A'; reindex pins the column order.
df = (
    pd.DataFrame(hotels)
    .fillna('N/A')
    .reindex(columns=[
        'name',
        'rating',
        'reviews',
        'nightly_price',
        'total_price',
        'breakfast',
        'facilities',
    ])
)

print(f"Extracted {len(hotels)} hotels' information")
df


In [14]:
# Write the table to an Excel workbook on a sheet named 'Expedia'.
# NOTE(review): the filename spells "Combodia"; left unchanged in case other
# tooling expects this exact path.
df.to_excel(
    "Expedia_Combodia_hotels.xlsx",
    sheet_name='Expedia',
)