In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

# Scrape OYO hotel listing pages city by city and collect one row per hotel
# card into parallel lists, which are turned into a DataFrame at the end.

# One list per output column (6 scraped features + the city label). Rows are
# aligned by position: every hotel card appends exactly once to each list.
hotel_name = []
hotel_price = []
hotel_rating = []
hotel_location = []
hotel_amenities = []
hotel_category = []
city_name = []

# List of cities to scrape
cities = ['hyderabad', 'delhi', 'mumbai', 'goa', 'kolkata', 'chennai', 'bengaluru']

# Headers to mimic browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    # The catch-all media range has to be "*/*"; a bare "/" is not a valid range.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

# Tunables for the crawl.
REQUEST_TIMEOUT = 10        # seconds passed to requests as the per-request timeout
PAGE_DELAY_SECONDS = 1      # fixed pause between successive listing pages
MAX_PAGES_PER_CITY = 200    # safety cap so pagination can never loop forever
MAX_AMENITIES = 3           # only the first few amenity elements are kept per hotel


def _looks_like_price(text):
    """Match element text that contains the rupee sign."""
    return text and '₹' in str(text)


def _looks_like_rating(text):
    """Match short element text containing a dot (e.g. '4.6')."""
    return text and '.' in str(text) and len(str(text)) < 5


# Candidate <div> attribute filters for a hotel card, tried in order.
CARD_SELECTORS = [
    {'class': 'hotelCardListing'},
    {'data-testid': 'hotel-card'},
    {'class': 'oyo-row oyo-row--no-spacing listingHotelDescription'},
]

# Per-field fallbacks: (tag name, keyword arguments for Tag.find), tried in order.
NAME_SELECTORS = [
    ('h3', {'attrs': {'class': 'listingHotelDescription__hotelName'}}),
    ('h2', {}),
    ('div', {'attrs': {'class': 'd-textEllipsis'}}),
]
PRICE_SELECTORS = [
    ('span', {'attrs': {'class': 'listingPrice'}}),
    ('span', {'string': _looks_like_price}),
]
RATING_SELECTORS = [
    ('span', {'attrs': {'class': 'hotelRating'}}),
    ('span', {'attrs': {'class': 'is-fontBold'}}),
    ('div', {'string': _looks_like_rating}),
]
LOCATION_SELECTORS = [
    ('span', {'attrs': {'class': 'hotelLocation'}}),
    ('p', {'attrs': {'class': 'listingHotelDescription__hotelLocation'}}),
    ('span', {'attrs': {'class': 'u-line--clamp-2'}}),
]
CATEGORY_SELECTORS = [
    ('span', {'attrs': {'class': 'hotelType'}}),
    ('div', {'attrs': {'class': 'd-body-sm'}}),
    ('span', {'attrs': {'class': 'listingHotelDescription__badge'}}),
]


def _find_hotel_cards(soup):
    """Return the hotel card elements of a listing page, or [] if none match."""
    for attrs in CARD_SELECTORS:
        cards = soup.find_all('div', attrs=attrs)
        if cards:
            return cards
    return []


def _first_text(card, selectors):
    """Return the stripped text of the first selector that matches, else None."""
    for tag_name, find_kwargs in selectors:
        match = card.find(tag_name, **find_kwargs)
        if match:
            return match.text.strip()
    return None


def _amenities_text(card):
    """Return up to MAX_AMENITIES amenity texts joined by ', ', or None."""
    found = card.find_all('span', attrs={'class': 'amenity'})
    if not found:
        found = card.find_all('div', attrs={'class': 'amenityWrapper'})
    if not found:
        return None
    joined = ', '.join(a.text.strip() for a in found[:MAX_AMENITIES])
    # An empty join is recorded as missing rather than as an empty string.
    return joined if joined else None


total_time = time.time()

# One Session for the whole crawl: the connection (and any cookies) are reused
# across pages instead of being set up again for every request.
with requests.Session() as session:
    session.headers.update(headers)

    for city in cities:
        start_time = time.time()
        page_num = 1
        hotel_count = 0

        while page_num <= MAX_PAGES_PER_CITY:
            URL = f"https://www.oyorooms.com/hotels-in-{city}/?page={page_num}"
            print(f"Scraping {city.title()} - Page {page_num}")

            # Only the HTTP call can raise RequestException, so only it is guarded.
            try:
                page = session.get(URL, timeout=REQUEST_TIMEOUT)
                page.raise_for_status()
            except requests.exceptions.RequestException as e:
                # Give up on this city but keep what was collected so far.
                print(f"Error scraping {city}: {e}")
                break

            soup = BeautifulSoup(page.text, "html.parser")
            hotels = _find_hotel_cards(soup)

            # Stop if no hotels found on current page
            if not hotels:
                print(f"No more hotels found for {city.title()} at page {page_num}. Moving to next city.")
                break

            for hotel in hotels:
                # Append exactly one value (or None) per list to keep rows aligned.
                hotel_name.append(_first_text(hotel, NAME_SELECTORS))
                hotel_price.append(_first_text(hotel, PRICE_SELECTORS))
                hotel_rating.append(_first_text(hotel, RATING_SELECTORS))
                hotel_location.append(_first_text(hotel, LOCATION_SELECTORS))
                hotel_amenities.append(_amenities_text(hotel))
                hotel_category.append(_first_text(hotel, CATEGORY_SELECTORS))
                city_name.append(city.title())

                hotel_count += 1

            page_num += 1  # Move to next page
            time.sleep(PAGE_DELAY_SECONDS)  # fixed delay
        else:
            # Reached only when the loop ran out of pages without hitting a break.
            print(f"Reached the {MAX_PAGES_PER_CITY}-page safety cap for {city.title()}; stopping pagination.")

        print(f'{city.title()} completed with {hotel_count} hotels in {time.time()-start_time:.2f} seconds')

print("Total Time Completed in seconds", str(time.time()-total_time))

# Create DataFrame
df = pd.DataFrame({
    'City': city_name,
    'Hotel_Name': hotel_name,
    'Price': hotel_price,
    'Rating': hotel_rating,
    'Location': hotel_location,
    'Amenities': hotel_amenities,
    'Category': hotel_category
})

Scraping Hyderabad - Page 1
Scraping Hyderabad - Page 2
Scraping Hyderabad - Page 3
Scraping Hyderabad - Page 4
Scraping Hyderabad - Page 5
Scraping Hyderabad - Page 6
Scraping Hyderabad - Page 7
Scraping Hyderabad - Page 8
Scraping Hyderabad - Page 9
Scraping Hyderabad - Page 10
Scraping Hyderabad - Page 11
Scraping Hyderabad - Page 12
Scraping Hyderabad - Page 13
Scraping Hyderabad - Page 14
Scraping Hyderabad - Page 15
Scraping Hyderabad - Page 16
Scraping Hyderabad - Page 17
Scraping Hyderabad - Page 18
Scraping Hyderabad - Page 19
Scraping Hyderabad - Page 20
Scraping Hyderabad - Page 21
Scraping Hyderabad - Page 22
Scraping Hyderabad - Page 23
Scraping Hyderabad - Page 24
Scraping Hyderabad - Page 25
Scraping Hyderabad - Page 26
Scraping Hyderabad - Page 27
Scraping Hyderabad - Page 28
Scraping Hyderabad - Page 29
No more hotels found for Hyderabad at page 29. Moving to next city.
Hyderabad completed with 550 hotels in 149.88 seconds
Scraping Delhi - Page 1
Scraping Delhi - Page 

In [3]:
# Assemble the per-feature lists filled by the scraping cell into one table;
# the pair order below fixes the column order.
df = pd.DataFrame(dict([
    ('City', city_name),
    ('Hotel_Name', hotel_name),
    ('Price', hotel_price),
    ('Rating', hotel_rating),
    ('Location', hotel_location),
    ('Amenities', hotel_amenities),
    ('Category', hotel_category),
]))

In [4]:
# (rows, columns) of the scraped table.
df.shape

(1604, 7)

In [5]:
# Per-column non-null counts and dtypes, to see which scraped fields came back empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1604 entries, 0 to 1603
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City        1604 non-null   object
 1   Hotel_Name  1604 non-null   object
 2   Price       1334 non-null   object
 3   Rating      1604 non-null   object
 4   Location    1604 non-null   object
 5   Amenities   1599 non-null   object
 6   Category    0 non-null      object
dtypes: object(7)
memory usage: 87.8+ KB


In [6]:
# Count the missing values in each column.
df.isna().sum()

City             0
Hotel_Name       0
Price          270
Rating           0
Location         0
Amenities        5
Category      1604
dtype: int64

In [9]:
# Storage dtype of each column.
df.dtypes

City          object
Hotel_Name    object
Price         object
Rating        object
Location      object
Amenities     object
Category      object
dtype: object

In [13]:
import pandas as pd

# Reload the saved scrape results from disk.
# The file's first column appears to be a saved row index (assumption — confirm
# against the cell that writes the CSV); reading it as the index stops it from
# showing up as a spurious "Unnamed: 0" data column.
df = pd.read_csv('hotel_data.csv', index_col=0)
df

Unnamed: 0,City,Hotel_Name,Price,Rating,Location,Amenities,Category
0,Hyderabad,Super Townhouse Narayanaguda Metro Station For...,₹1894,4.6,"Near Old MLA Quarters, Himayathnagar, Hyderabad",Parking facility Dining area Free Wifi + 14...,
1,Hyderabad,Hotel O A R International,₹760,3.1,"Main Road Shamshabad, Hyderabad",Parking facility Dining area Free Wifi + 11...,
2,Hyderabad,"Super Collection O Regal Near ISKON TEMPLE , A...",₹1427,4.2,"Nampally station road, Adj Lane Metro Foot Wea...",Parking facility Free Wifi Power backup + 1...,
3,Hyderabad,Super Townhouse Kothapet Formerly Surabhi Elite,₹1624,4.6,"Kothapet, Hyderabad",Free Wifi Power backup Parking + 10 more,
4,Hyderabad,Super Townhouse Oak Madhapur Nera Regency,₹1919,4.4,"Madhapur, Hyderabad",Dining area Power backup Parking + 29 more,
...,...,...,...,...,...,...,...
1599,Chennai,Collection O Vadapalani Metro Station Vishnu's...,,NEW,"Kodambakkam Village, Chennai",Free Wifi Geyser Power backup + 4 more,
1600,Chennai,Collection O Kelambakkam Chennai,,NEW,"Chennai, Chennai",Free Wifi Geyser Power backup + 4 more,
1601,Chennai,Collection O MA Chidambaram Stadium Sarvanas Park,,NEW,"C.N. Krishnasawamy Raod, Chennai",Free Wifi Geyser Power backup + 4 more,
1602,Chennai,Hotel O Sithalapakkam Medavakkam,,NEW,"Sithalapakkam, Chennai",Free Wifi Geyser Power backup + 2 more,
