In [1]:
import pandas as pd
import pathlib


In [15]:
# Base data folder (relative path); the raw input and the cleaned output are both addressed beneath it.
DATA_DIR = pathlib.Path("./data/")

In [17]:
# Load the raw listings CSV. The location is assembled from DATA_DIR with
# pathlib (data/raw/Real Estate Data V21.csv) rather than a hand-written string.
data_path = DATA_DIR.joinpath("raw", "Real Estate Data V21.csv")
raw_df = pd.read_csv(data_path)
# Work on a copy so raw_df keeps the file contents exactly as read.
df = raw_df.copy()
raw_df.head()


Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [18]:
# Column dtypes and non-null counts of the frame as read from the CSV.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14528 non-null  object 
 1   Property Title  14528 non-null  object 
 2   Price           14528 non-null  object 
 3   Location        14528 non-null  object 
 4   Total_Area      14528 non-null  int64  
 5   Price_per_SQFT  14528 non-null  float64
 6   Description     14528 non-null  object 
 7   Baths           14528 non-null  int64  
 8   Balcony         14528 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1021.6+ KB


In [19]:
# (rows, columns) of the working copy.
df.shape

(14528, 9)

In [20]:
# Number of missing values per column.
df.isnull().sum()

Name              0
Property Title    0
Price             0
Location          0
Total_Area        0
Price_per_SQFT    0
Description       0
Baths             0
Balcony           0
dtype: int64

In [21]:
# Translate the "Yes"/"No" balcony text into True/False
# (any other text would map to a missing value).
balcony_flags = {"Yes": True, "No": False}
df["hasBalcony"] = raw_df["Balcony"].map(balcony_flags)

# Location is a comma-separated string: the last piece is stored as the
# city and the first piece as the suburb, each with surrounding spaces removed.
location_parts = raw_df["Location"].str.split(",")
df["city"] = location_parts.str.get(-1).str.strip()
df["suburb"] = location_parts.str.get(0).str.strip()

In [22]:
# Normalise the price text towards the "₹<number> <unit>" shape that
# parse_price splits on.
# A price that ends in a bare single-digit decimal (e.g. "₹1.5") gets a "Cr"
# unit appended. The dot is escaped so that only a literal decimal point
# matches; unescaped it matched any character, so "₹105" would have been
# rewritten to "₹105 Cr".
prices = raw_df["Price"].str.replace(r"₹([0-9]\.[0-9]$)", r"₹\1 Cr", regex=True)
# Plain-text substitutions. regex=False is spelled out because the default
# of `regex` is not the same across pandas versions.
prices = prices.str.replace("Lacs", "L", regex=False)
prices = prices.str.replace("k", " K", regex=False)

def parse_price(price: str) -> "int | None":
    """Convert a price label such as ``"₹1.99 Cr"`` into a whole number.

    Parameters
    ----------
    price : str
        Text of the form ``"₹<number> <unit>"`` where ``<unit>`` is one of
        ``Cr`` (x 1,00,00,000), ``L`` (x 1,00,000) or ``K`` (x 1,000).

    Returns
    -------
    int or None
        ``<number>`` multiplied by the unit factor and rounded to the nearest
        integer. ``None`` is returned when ``price`` is not a string, does not
        consist of exactly a number and a unit, carries an unrecognised unit,
        or the number cannot be converted.
    """
    multipliers = {"Cr": 1_00_00_000, "L": 1_00_000, "K": 1_000}

    if not isinstance(price, str):
        return None

    # split() with no argument tolerates repeated or leading whitespace.
    parts = price.replace('₹', '').split()
    if len(parts) != 2:
        return None

    val, suf = parts
    if suf not in multipliers:
        # An unknown unit is rejected instead of being read as a plain number.
        return None

    try:
        # round() instead of int(): float("1.15") * 1_00_000 evaluates to
        # 114999.99999999999, which int() would truncate to 114999.
        return round(float(val) * multipliers[suf])
    except (ValueError, OverflowError):
        # Non-numeric text, NaN, or infinity.
        return None
    

# Run the parser on every entry; entries it cannot interpret come back as None.
prices = prices.map(parse_price)


In [23]:
# Attach the parsed numeric prices to the working frame.
df["price_parsed"] = prices

In [24]:
# Split titles shaped like "<n> <room code> <property type> for sale ..." into
# three columns. A single `[0-9]+` group is used: the earlier nested form
# `([0-9]+)+` captured the same text but could backtrack exponentially on a
# long digit run that is not followed by whitespace.
p_type = raw_df["Property Title"].str.extract(r"([0-9]+)\s+(BHK|RK|BH|R)\s+(.*?)\s+for sale")
p_type.columns = ["Bed Rooms", "Room Type", "Property Type"]

# Titles that do not match the pattern are treated as studio flats.
# The extracted counts are strings, so the default is the string "1"; this
# keeps the column a single type until it is cast to an integer dtype later.
p_type["Bed Rooms"] = p_type["Bed Rooms"].fillna("1")  # there has to be at least one room
p_type["Property Type"] = p_type["Property Type"].fillna("Studio")  # studio flats carry no room count
p_type["Room Type"] = p_type["Room Type"].fillna("R")  # room code used for studio flats

p_type

Unnamed: 0,Bed Rooms,Room Type,Property Type
0,4,BHK,Flat
1,10,BHK,Independent House
2,3,BHK,Flat
3,7,BHK,Independent House
4,2,BHK,Flat
...,...,...,...
14523,2,BHK,Flat
14524,1,BHK,Independent House
14525,1,BHK,Flat
14526,3,BHK,Flat


In [25]:
# Put the title-derived columns side by side with the listing columns;
# rows are aligned on the index.
final = pd.concat(objs=[df, p_type], axis="columns")
final

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,hasBalcony,city,suburb,price_parsed,Bed Rooms,Room Type,Property Type
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,True,Chennai,Kanathur Reddikuppam,19900000,4,BHK,Flat
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,True,Chennai,Ramanathan Nagar,22500000,10,BHK,Independent House
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,False,Chennai,Kasthuribai Nagar,10000000,3,BHK,Flat
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,True,Chennai,Naveenilaya,33300000,7,BHK,Independent House
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,True,Chennai,Avadi,4800000,2,BHK,Flat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14523,"Krishna Park Extension, Tilak Nagar,New Delhi","2 BHK Flat for sale in Tilak Nagar, New Delhi",₹40.0 L,"Krishna Park Extension, Tilak Nagar,New Delhi",1053,3800.0,Looking for a good 2 BHK Apartment in Tilak Na...,3,Yes,True,New Delhi,Krishna Park Extension,4000000,2,BHK,Flat
14524,"Rawta, Jaffarpur Kalan,New Delhi",1 BHK Independent House for sale in Jaffarpur ...,₹14.0 L,"Rawta, Jaffarpur Kalan,New Delhi",472,2970.0,1 BHK Independent House for sale in Jaffarpur ...,2,Yes,True,New Delhi,Rawta,1400000,1,BHK,Independent House
14525,"Rani Garden, Geeta Colony,New Delhi","1 BHK Flat for sale in Geeta Colony, New Delhi",₹30.0 L,"Rani Garden, Geeta Colony,New Delhi",378,7940.0,"Property for sale in Geeta Colony, Delhi. This...",1,Yes,True,New Delhi,Rani Garden,3000000,1,BHK,Flat
14526,"Lig flat rohini,Sector 16E, Sector 16 Rohini,N...","3 BHK Flat for sale in Sector 16 Rohini, New D...",₹60.0 L,"Lig flat rohini,Sector 16E, Sector 16 Rohini,N...",700,8570.0,Check out this 3 BHK Apartment for sale in Roh...,2,Yes,True,New Delhi,Lig flat rohini,6000000,3,BHK,Flat


In [26]:
# Remove the raw text columns whose content has been turned into derived
# columns (room/property type, price_parsed, city/suburb, hasBalcony).
# The result is rebound instead of using inplace=True, and errors="ignore"
# lets the cell be re-run without a KeyError for columns already removed.
final = final.drop(
    columns=["Property Title", "Price", "Location", "Balcony"],
    errors="ignore",
)



In [27]:
# Cast the label-like columns to category and the small counts to int8,
# then list the resulting dtypes.
final = final.astype(
    {
        **dict.fromkeys(["city", "suburb", "Room Type", "Property Type"], "category"),
        **dict.fromkeys(["Bed Rooms", "Baths"], "int8"),
    }
)

final.dtypes

Name                object
Total_Area           int64
Price_per_SQFT     float64
Description         object
Baths                 int8
hasBalcony            bool
city              category
suburb            category
price_parsed         int64
Bed Rooms             int8
Room Type         category
Property Type     category
dtype: object

In [28]:
# Non-null counts, dtypes and memory usage of the cleaned frame.
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Name            14528 non-null  object  
 1   Total_Area      14528 non-null  int64   
 2   Price_per_SQFT  14528 non-null  float64 
 3   Description     14528 non-null  object  
 4   Baths           14528 non-null  int8    
 5   hasBalcony      14528 non-null  bool    
 6   city            14528 non-null  category
 7   suburb          14528 non-null  category
 8   price_parsed    14528 non-null  int64   
 9   Bed Rooms       14528 non-null  int8    
 10  Room Type       14528 non-null  category
 11  Property Type   14528 non-null  category
dtypes: bool(1), category(4), float64(1), int64(2), int8(2), object(2)
memory usage: 990.9+ KB


In [30]:
# Write the cleaned table to Excel. The target folder is created first so the
# export does not fail when data/clean does not exist yet.
clean_path = DATA_DIR / "clean" / "Real_Estate_Cleaned.xlsx"
clean_path.parent.mkdir(parents=True, exist_ok=True)
final.to_excel(clean_path)