In [1]:
from typing import Callable
import requests
from bs4 import BeautifulSoup, Tag
import pandas as pd
import json

In [2]:
# Parse the saved results page. The context manager closes the file handle
# as soon as BeautifulSoup has finished reading it.
with open('1.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

### There are two kinds of data present on the webpage:

1. SRP Data
2. Project Data

In [3]:
# Collect every element whose class is srpTuple__descCont (SRP cards) or
# projectTuple__descCont (project cards).
srp_cards: list[Tag] = soup.find_all(class_='srpTuple__descCont')
project_cards: list[Tag] = soup.find_all(class_='projectTuple__descCont')

In [4]:
# Sanity check: how many cards of each kind were found on the page.
len(srp_cards), len(project_cards)

(51, 76)

In [5]:
# First card of each kind. Neither name is referenced again in the visible
# part of the notebook -- presumably kept for interactive inspection.
srp = srp_cards[0]
project = project_cards[0]

## Scrape SRP Cards

In [6]:
def make_srp_item_list(
        str_func: Callable[[str], str] | None = None,
        use_find_all: bool = False,
        **kwargs,
) -> list[list[Tag] | str | None]:
    result = []

    for i in srp_cards:
        if use_find_all:
            f = i.find_all(**kwargs)
            result.append(f)
        else:
            f = i.find(**kwargs)
            if f is None:
                result.append(None)
                continue
            if str_func:
                result.append(str_func(f.text))
            else:
                result.append(f.text)

    return result

In [7]:
# Title column: text of the first <h2> in each SRP card (raw, whitespace not stripped).
titles = make_srp_item_list(name='h2')

In [8]:
# Raw text of the first element with class srpTuple__propertyPremiumHeading in each card.
property_name = make_srp_item_list(attrs={'class': 'srpTuple__propertyPremiumHeading'})

In [9]:
# Raw text of the first SliderTagsAndChips__sliderChips element per card;
# None for cards without one (most cards in the recorded run).
nearby_places = make_srp_item_list(attrs={'class': 'SliderTagsAndChips__sliderChips'})

In [10]:
# Raw text of the first SliderTagsAndChips__nearByInfo element per card;
# None for cards without one.
nearby_places_count = make_srp_item_list(attrs={'class': 'SliderTagsAndChips__nearByInfo'})

In [11]:
# Keep only the text before the first newline of the price element.
# NOTE(review): one row of the recorded output reads '₹ 61.91 L₹ 6,253/sq.ft.',
# so this cut does not always separate the price from the per-unit price.
price = make_srp_item_list(lambda text: text.partition('\n')[0], attrs={'id': 'srp_tuple_price'})

In [12]:
# Raw text of the element with id srp_tuple_price_per_unit_area (not stripped).
price_per_unit = make_srp_item_list(attrs={'id': 'srp_tuple_price_per_unit_area'})

In [13]:
# Keep only the text before the first newline of the primary-area element.
area = make_srp_item_list(lambda text: text.partition('\n')[0], attrs={'id': 'srp_tuple_primary_area'})

In [14]:
# Raw text of the element with id srp_tuple_secondary_area (not stripped).
area_per_unit = make_srp_item_list(attrs={'id': 'srp_tuple_secondary_area'})

In [15]:
# Keep whatever precedes ' BHK' in the bedroom element's text
# (the whole text when that marker is absent).
bhk = make_srp_item_list(lambda text: text.partition(' BHK')[0], attrs={'id': 'srp_tuple_bedroom'})

In [16]:
# Keep whatever precedes ' Baths' in the bathroom element's text
# (the whole text when that marker is absent).
bhk_baths = make_srp_item_list(lambda text: text.partition(' Baths')[0], attrs={'id': 'srp_tuple_bathroom'})

In [17]:
# Description text with leading/trailing whitespace removed.
desc = make_srp_item_list(str.strip, attrs={'id': 'srp_tuple_description'})

In [18]:
# Stripped text of the element tagged data-label="USP_BAND"; None where a card has none.
usp_band = make_srp_item_list(str.strip, attrs={'data-label': 'USP_BAND'})

In [19]:
# Stripped text of the srpTuple__badgeWrap element of each card.
property_badge = make_srp_item_list(str.strip, attrs={'class': 'srpTuple__badgeWrap'})

In [20]:
# Column name -> list of per-card values (one entry per SRP card),
# in the column order the DataFrame should use.
srp_data = dict(
    title=titles,
    property_name=property_name,
    property_badge=property_badge,
    nearby_places=nearby_places,
    nearby_places_count=nearby_places_count,
    price=price,
    price_per_unit=price_per_unit,
    area=area,
    area_per_unit=area_per_unit,
    usp_band=usp_band,
    desc=desc,
    bhk=bhk,
    bhk_baths=bhk_baths,
)

In [21]:
# Every column holds one entry per card, so all lengths should be equal.
list(map(len, srp_data.values()))

[51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51]

In [22]:
# Assemble the per-field lists into one table: one row per SRP card.
srp_df = pd.DataFrame(srp_data)
print(srp_df.shape)  # (rows, columns)
srp_df.head()  # preview the first five rows

(51, 13)


Unnamed: 0,title,property_name,property_badge,nearby_places,nearby_places_count,price,price_per_unit,area,area_per_unit,usp_band,desc,bhk,bhk_baths
0,4 BHK\n ...,\nDLF Garden City Floors,FEATURED\n\n\n\n ...,\n\n1\n ...,\n\n ...,₹ 1.9 Cr,"₹ 7,037/sq.ft.\n ...","2,700 sq.ft.",(251 sq.m.) Carpet\n ...,East\n ...,4bhk\n ...,4.0,4.0
1,2 BHK\n ...,\n ...,FEATURED\n\n\n\n ...,\n\n1\n ...,\n\n ...,"₹ 61.91 L₹ 6,253/sq.ft.","₹ 6,253/sq.ft.\n ...",990 sq.ft.,(92 sq.m.) Carpet Area\n ...,Gated\n ...,Move\n ...,2.0,2.0
2,4 BHK\n ...,\nDLF Garden City Floors,UNDER CONSTRUCTION\n\n\n\n ...,,,₹ 2.7 Cr,"₹ 13,989/sq.ft.\n ...","2,885 sq.ft.",(268 sq.m.) Super\n ...,,Here\n ...,4.0,4.0
3,3 BHK Flat\n ...,\nSS Almeria,READY TO MOVE\n\n\n\n ...,,,₹ 1.48 Cr,"₹ 7,400/sq.ft.\n ...","2,000 sq.ft.",(186 sq.m.) Super\n ...,Recently\n ...,This\n ...,3.0,3.0
4,Residential\n ...,\n ...,IN GATED COMMUNITY\n\n\n\n ...,,,₹ 4.95 Cr,"₹ 18,333/sq.ft.\n ...","2,700 sq.ft.",(251 sq.m.) Plot Area\n ...,,Residential\n ...,,


In [23]:
# Per-column non-null counts and dtypes; shows which fields are missing on some cards.
srp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                51 non-null     object
 1   property_name        51 non-null     object
 2   property_badge       51 non-null     object
 3   nearby_places        2 non-null      object
 4   nearby_places_count  2 non-null      object
 5   price                51 non-null     object
 6   price_per_unit       51 non-null     object
 7   area                 51 non-null     object
 8   area_per_unit        51 non-null     object
 9   usp_band             36 non-null     object
 10  desc                 51 non-null     object
 11  bhk                  51 non-null     object
 12  bhk_baths            30 non-null     object
dtypes: object(13)
memory usage: 5.3+ KB


In [25]:
# Inspect one raw title; it still carries embedded newlines and padding.
srp_df.loc[0, 'title']

'4 BHK\n                                                                        Independent Builder Floor in Sector 92 Gurgaon\n                                                                    '

## Scrape Project Cards