In [5]:
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
def data_to_df(data):
    soup = BeautifulSoup(data, 'lxml')
    df = pd.DataFrame()
    failed = 0
    for listing in soup.find_all('div', {"class":"lst-dtls"}):
        try:
            new_row = {'title': listing.find('a', {"class":"lst-title"}).text,
                       'location': listing.find('div', {"class":"lst-loct"}).text,
                       'price': listing.find('span', {"class":"lst-price"}).text}
            more_cols = [t.text for t in listing.find_all('div', {'class':'lst-sub-title'})]
            more_vals = [v.text for v in listing.find_all('div', {'class':'lst-sub-value'})]
            new_row.update({col:val for col, val in zip(more_cols, more_vals)})
            df = df.append(new_row, ignore_index=True)
        except:
            failed +=1
    print(f'{len(df)-failed} rows converted successfully\n({failed} listings could not be converted)')
    return df

---
# Mumbai
### data from housing.com:

Housing.com Mumbai scrape sorted by **all**:

In [10]:
with open("scrapes/housing.com_all.txt", 'r') as data:
    data = data.read()
mumbai_by_all = data_to_df(data)
mumbai_by_all.head()

6634 rows converted successfully
(7 listings could not be converted)


Unnamed: 0,Avg. Rate,Configs,Possession Date,location,price,title,Avg. Price,Built Up Area,Possession Status,Plot Area
0,33.48k per sqft.,"1, 1.5, 2, 3 BHK Apartments","Sep, 2020","Mazgaon, Mumbai",1.18 Cr - 2.37 Cr,Alfa Mana A M Residency,,,,
1,,3 BHK Apartment,"Dec, 2022","Charni Road, Girgaon, Mumbai",6.5 Cr - 6.8 Cr,Sanghvi Parsssva ExcellenSea,,,,
2,42.37k per sqft.,"2, 3 BHK Apartments","Jan, 2025","Byculla, Mumbai",3.36 Cr - 4.95 Cr,Piramal Aranya Arav Tower,,,,
3,,,,"Bhandup West, Mumbai",87.5 L,2 BHK Apartment,11.09k / sqft.,789 sqft.,,
4,,"1, 2, 3 BHK Apartments","Mar, 2022","Kanjurmarg East, Mumbai",1.05 Cr - 2.53 Cr,Kanakia Zenworld Phase I,,,,


Housing.com Mumbai scrape sorted by **area:**

In [4]:
with open("../autosaves/housing.com_area.txt", 'r') as data:
    data = data.read()
mumbai_by_area = data_to_df(data)
mumbai_by_area.head()

13467 rows converted successfully
(3 listings could not be converted)


Unnamed: 0,Avg. Price,Plot Area,location,price,title,Built Up Area,Possession Date,Possession Status,Avg. Rate,Configs
0,22.0 / sqft.,1134380 sqft.,"Kandivali West, Mumbai",2.6 Cr,Residential Plot,,,,,
1,87.0 / sqft.,,"Powai, Mumbai",24.0k,1 RK Apartment,275 sqft.,,,,
2,100.0 / sqft.,,"Mahim, Mumbai",20.0k,1 BHK Apartment,200 sqft.,,,,
3,125.0 / sqft.,,"Chembur, Mumbai",1.2 L,2 BHK Apartment,960 sqft.,"30th Jan, 2021",,,
4,153.0 / sqft.,,"Kandivali East, Mumbai",1.67 L,2 BHK Apartment,1090 sqft.,,,,


Housing.com Mumbai scrape sorted by **new:**

In [5]:
with open("../autosaves/housing.com_new.txt", 'r') as data:
    data = data.read()
mumbai_by_new = data_to_df(data)
mumbai_by_new.head()

6974 rows converted successfully
(1 listings could not be converted)


Unnamed: 0,Avg. Price,Built Up Area,Possession Date,location,price,title,Possession Status,Configs,Plot Area,Avg. Rate
0,13.44k / sqft.,521 sqft.,"19th Aug, 2020","Andheri East, Mumbai",70.0 L,1 BHK Apartment,,,,
1,9.23k / sqft.,390 sqft.,"30th Dec, 2022","Kandivali East, Mumbai",36.0 L,1 BHK Apartment,,,,
2,15.06k / sqft.,332 sqft.,,"Sion, Mumbai",50.0 L,1 RK Apartment,,,,
3,14.29k / sqft.,350 sqft.,,"Sion, Mumbai",50.0 L,1 RK Apartment,,,,
4,10.0k / sqft.,350 sqft.,,"Kandivali East, Mumbai",35.0 L,1 BHK Apartment,,,,


In [9]:
df_combined = pd.concat([mumbai_by_all, mumbai_by_area, mumbai_by_new], sort=False)
print(df_combined.shape)
print(f'(-{sum(df_combined.duplicated())}) duplicates',)
df_combined.drop_duplicates(inplace=True)
print(df_combined.shape)
df_combined.head()

(27088, 10)
(-9089) duplicates
(17999, 10)


Unnamed: 0,Avg. Rate,Configs,Possession Date,location,price,title,Avg. Price,Built Up Area,Possession Status,Plot Area
0,33.48k per sqft.,"1, 1.5, 2, 3 BHK Apartments","Sep, 2020","Mazgaon, Mumbai",1.18 Cr - 2.37 Cr,Alfa Mana A M Residency,,,,
1,,3 BHK Apartment,"Dec, 2022","Charni Road, Girgaon, Mumbai",6.5 Cr - 6.8 Cr,Sanghvi Parsssva ExcellenSea,,,,
2,42.37k per sqft.,"2, 3 BHK Apartments","Jan, 2025","Byculla, Mumbai",3.36 Cr - 4.95 Cr,Piramal Aranya Arav Tower,,,,
3,,,,"Bhandup West, Mumbai",87.5 L,2 BHK Apartment,11.09k / sqft.,789 sqft.,,
4,,"1, 2, 3 BHK Apartments","Mar, 2022","Kanjurmarg East, Mumbai",1.05 Cr - 2.53 Cr,Kanakia Zenworld Phase I,,,,


In [11]:
df_combined.to_csv('housing.com_data.csv', index=False)