# Pre-processing

In [22]:
import pandas as pd
import numpy as np
import logging
from pandas.errors import ParserError, EmptyDataError


In [19]:
filepath = '../../datasets/raw/hyderabad_house_price_original.csv'
raw_data = pd.read_csv(filepath)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518 entries, 0 to 2517
Data columns (total 40 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Price                2518 non-null   int64 
 1   Area                 2518 non-null   int64 
 2   Location             2518 non-null   object
 3   No. of Bedrooms      2518 non-null   int64 
 4   Resale               2518 non-null   int64 
 5   MaintenanceStaff     2518 non-null   int64 
 6   Gymnasium            2518 non-null   int64 
 7   SwimmingPool         2518 non-null   int64 
 8   LandscapedGardens    2518 non-null   int64 
 9   JoggingTrack         2518 non-null   int64 
 10  RainWaterHarvesting  2518 non-null   int64 
 11  IndoorGames          2518 non-null   int64 
 12  ShoppingMall         2518 non-null   int64 
 13  Intercom             2518 non-null   int64 
 14  SportsFacility       2518 non-null   int64 
 15  ATM                  2518 non-null   int64 
 16  ClubHo

In [26]:
class ColumnInfo:
    def __init__(self, name, desc, dtype, format_str=None):
        """
        Initializes the feature information.
        
        Args:
            name (str): The name of the column.
            desc (str): The description of the feature.
            dtype (type): The expected data type (e.g., np.int64, str).
            format_str (str, optional): The format string for display.
        """
        self.name = name
        self.desc = desc
        self.dtype = dtype
        self.format_str = format_str

class Column:
    PRICE = ColumnInfo("Price", "The price of the house.", np.int64, format_str='$ {:,.0f}')
    AREA = ColumnInfo("Area", "The area of the property in square feet.", np.int64, format_str='{:,.0f}')
    LOCATION = ColumnInfo("Location", "The neighborhood in Hyderabad.", str)
    NO_OF_BEDROOMS = ColumnInfo("No. of Bedrooms", "The number of bedrooms.", np.int64, format_str='{:.0f}')
    RESALE = ColumnInfo("Resale", "A binary flag indicating if the property is for resale.", np.int64, format_str='{:.0f}')
    MAINTENANCE_STAFF = ColumnInfo("MaintenanceStaff", "A flag for the availability of maintenance staff.", np.int64, format_str='{:.0f}')
    GYMNASIUM = ColumnInfo("Gymnasium", "A flag for the availability of a gymnasium.", np.int64, format_str='{:.0f}')
    SWIMMING_POOL = ColumnInfo("SwimmingPool", "A flag for the availability of a swimming pool.", np.int64, format_str='{:.0f}')
    LANDSCAPED_GARDENS = ColumnInfo("LandscapedGardens", "A flag for the availability of landscaped gardens.", np.int64, format_str='{:.0f}')
    JOGGING_TRACK = ColumnInfo("JoggingTrack", "A flag for the availability of a jogging track.", np.int64, format_str='{:.0f}')
    RAIN_WATER_HARVESTING = ColumnInfo("RainWaterHarvesting", "A flag for the availability of rainwater harvesting.", np.int64, format_str='{:.0f}')
    INDOOR_GAMES = ColumnInfo("IndoorGames", "A flag for the availability of indoor games facilities.", np.int64, format_str='{:.0f}')
    SHOPPING_MALL = ColumnInfo("ShoppingMall", "A flag for the availability of a nearby shopping mall.", np.int64, format_str='{:.0f}')
    INTERCOM = ColumnInfo("Intercom", "A flag for the availability of an intercom facility.", np.int64, format_str='{:.0f}')
    SPORTS_FACILITY = ColumnInfo("SportsFacility", "A flag for the availability of a sports facility.", np.int64, format_str='{:.0f}')
    ATM = ColumnInfo("ATM", "A flag for the availability of a nearby ATM.", np.int64, format_str='{:.0f}')
    CLUB_HOUSE = ColumnInfo("ClubHouse", "A flag for the availability of a club house.", np.int64, format_str='{:.0f}')
    SCHOOL = ColumnInfo("School", "A flag for the availability of a nearby school.", np.int64, format_str='{:.0f}')
    SECURITY_24X7 = ColumnInfo("24X7Security", "A flag for the availability of 24x7 security.", np.int64, format_str='{:.0f}')
    POWER_BACKUP = ColumnInfo("PowerBackup", "A flag for the availability of power backup.", np.int64, format_str='{:.0f}')
    CAR_PARKING = ColumnInfo("CarParking", "A flag for the availability of car parking.", np.int64, format_str='{:.0f}')
    STAFF_QUARTER = ColumnInfo("StaffQuarter", "A flag for the availability of a staff quarter.", np.int64, format_str='{:.0f}')
    CAFETERIA = ColumnInfo("Cafeteria", "A flag for the availability of a cafeteria.", np.int64, format_str='{:.0f}')
    MULTIPURPOSE_ROOM = ColumnInfo("MultipurposeRoom", "A flag for the availability of a multipurpose room.", np.int64, format_str='{:.0f}')
    HOSPITAL = ColumnInfo("Hospital", "A flag for the availability of a nearby hospital.", np.int64, format_str='{:.0f}')
    WASHING_MACHINE = ColumnInfo("WashingMachine", "A flag indicating if a washing machine is included.", np.int64, format_str='{:.0f}')
    GAS_CONNECTION = ColumnInfo("Gasconnection", "A flag for the availability of a gas connection.", np.int64, format_str='{:.0f}')
    AC = ColumnInfo("AC", "A flag indicating if air conditioning is included.", np.int64, format_str='{:.0f}')
    WIFI = ColumnInfo("Wifi", "A flag for the availability of WiFi.", np.int64, format_str='{:.0f}')
    CHILDRENS_PLAY_AREA = ColumnInfo("Children'splayarea", "A flag for the availability of a children's play area.", np.int64, format_str='{:.0f}')
    LIFT_AVAILABLE = ColumnInfo("LiftAvailable", "A flag for the availability of a lift.", np.int64, format_str='{:.0f}')
    BED = ColumnInfo("BED", "A flag indicating if a bed is included.", np.int64, format_str='{:.0f}')
    VAASTU_COMPLIANT = ColumnInfo("VaastuCompliant", "A flag indicating if the property is Vaastu compliant.", np.int64, format_str='{:.0f}')
    MICROWAVE = ColumnInfo("Microwave", "A flag indicating if a microwave is included.", np.int64, format_str='{:.0f}')
    GOLF_COURSE = ColumnInfo("GolfCourse", "A flag for the availability of a golf course.", np.int64, format_str='{:.0f}')
    TV = ColumnInfo("TV", "A flag indicating if a TV is included.", np.int64, format_str='{:.0f}')
    DINING_TABLE = ColumnInfo("DiningTable", "A flag indicating if a dining table is included.", np.int64, format_str='{:.0f}')
    SOFA = ColumnInfo("Sofa", "A flag indicating if a sofa is included.", np.int64, format_str='{:.0f}')
    WARDROBE = ColumnInfo("Wardrobe", "A flag indicating if a wardrobe is included.", np.int64, format_str='{:.0f}')
    REFRIGERATOR = ColumnInfo("Refrigerator", "A flag indicating if a refrigerator is included.", np.int64, format_str='{:.0f}')

    @staticmethod
    def get_format_dict():
        """
        Generates a format dictionary from the features defined in the class
        that have a format_str defined.
        """
        return {
            value.name: value.format_str
            for value in Column.__dict__.values()
            if isinstance(value, ColumnInfo) and value.format_str is not None
        }

    @staticmethod
    def get_dtypes_dict():
        """
        Generates a data type dictionary for all features defined in the class.
        """
        return {
            value.name: value.dtype
            for value in Column.__dict__.values()
            if isinstance(value, ColumnInfo)
        }
    

# Checking the expected data for each column
try:
    dataset = pd.read_csv(filepath, dtype=Column.get_dtypes_dict())
except (FileNotFoundError, ValueError, ParserError, EmptyDataError) as e:
    logging.exception(f"Error trying to load the dataset '{filepath}'. Cause: {e}")
    
all_stats = dataset.describe(include='all')
styled_all_stats = all_stats.style.format(Column.get_format_dict(), na_rep="-")

print("### Descriptive Statistics for ALL Columns ###")
display(styled_all_stats)

### Descriptive Statistics for ALL Columns ###


Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,RainWaterHarvesting,IndoorGames,ShoppingMall,Intercom,SportsFacility,ATM,ClubHouse,School,24X7Security,PowerBackup,CarParking,StaffQuarter,Cafeteria,MultipurposeRoom,Hospital,WashingMachine,Gasconnection,AC,Wifi,Children'splayarea,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
count,"$ 2,518",2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518,2518
unique,-,-,243,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
top,-,-,Kukatpally,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
freq,-,-,166,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
mean,"$ 9,818,380",1645,-,3,0,0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0
std,"$ 8,777,113",746,-,1,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
min,"$ 2,000,000",500,-,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25%,"$ 4,760,000",1160,-,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
50%,"$ 7,754,000",1500,-,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
75%,"$ 10,900,000",1829,-,3,0,0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0


In [None]:
import random as rd

# Utilities
class ColumnInfo:
    def __init__(self, name, desc):
        self.name = name
        self.desc = desc

class Column:
    PRICE = ColumnInfo("Price", "The price of the house.")
    AREA = ColumnInfo("Area", "The area of the property, probably in square feet.")
    LOCATION = ColumnInfo("Location", "The location or neighborhood in Hyderabad where the property is located.")
    NO_OF_BEDROOMS = ColumnInfo("No. of Bedrooms", "The number of bedrooms in the house.")
    RESALE = ColumnInfo("Resale", "A binary indicator that probably means 1 if the property is for resale and 0 if it is new.")
    MAINTENANCE_STAFF = ColumnInfo("MaintenanceStaff", "Indicates if maintenance staff is available.")
    GYMNASIUM = ColumnInfo("Gymnasium", "Indicates if the property has a gym.")
    SWIMMING_POOL = ColumnInfo("SwimmingPool", "Indicates if the property has a swimming pool.")
    LANDSCAPED_GARDENS = ColumnInfo("LandscapedGardens", "Indicates if the property has landscaped gardens.")
    JOGGING_TRACK = ColumnInfo("JoggingTrack", "Indicates if the property has a jogging track.")
    RAIN_WATER_HARVESTING = ColumnInfo("RainWaterHarvesting", "Indicates if the property has a rainwater harvesting system.")
    INDOOR_GAMES = ColumnInfo("IndoorGames", "Indicates if the property has facilities for indoor games.")
    SHOPPING_MALL = ColumnInfo("ShoppingMall", "Indicates if there is a shopping mall nearby or within the complex.")
    INTERCOM = ColumnInfo("Intercom", "Indicates if the property has an intercom system.")
    SPORTS_FACILITY = ColumnInfo("SportsFacility", "Indicates if the property has sports facilities.")
    ATM = ColumnInfo("ATM", "Indicates if there is an ATM nearby or within the complex.")
    CLUB_HOUSE = ColumnInfo("ClubHouse", "Indicates if the property has a clubhouse.")
    SCHOOL = ColumnInfo("School", "Indicates if there is a school nearby.")
    SECURITY_24X7 = ColumnInfo("24X7Security", "Indicates if the property has 24/7 security.")
    POWER_BACKUP = ColumnInfo("PowerBackup", "Indicates if the property has a power backup system.")
    CAR_PARKING = ColumnInfo("CarParking", "Indicates if car parking is available.")
    STAFF_QUARTER = ColumnInfo("StaffQuarter", "Indicates if the property has staff quarters.")
    CAFETERIA = ColumnInfo("Cafeteria", "Indicates if the property has a cafeteria.")
    MULTIPURPOSE_ROOM = ColumnInfo("MultipurposeRoom", "Indicates if the property has a multipurpose room.")
    HOSPITAL = ColumnInfo("Hospital", "Indicates if there is a hospital nearby.")
    WASHING_MACHINE = ColumnInfo("WashingMachine", "Indicates if a washing machine is included.")
    GAS_CONNECTION = ColumnInfo("Gasconnection", "Indicates if the property has a gas connection.")
    AC = ColumnInfo("AC", "Indicates if air conditioning is included.")
    WIFI = ColumnInfo("Wifi", "Indicates if Wi-Fi is available.")
    CHILDRENS_PLAY_AREA = ColumnInfo("Children'splayarea", "Indicates if there is a children's play area.")
    LIFT_AVAILABLE = ColumnInfo("LiftAvailable", "Indicates if a lift is available.")
    BED = ColumnInfo("BED", "Indicates if a bed is included.")
    VAASTU_COMPLIANT = ColumnInfo("VaastuCompliant", "Indicates if the property is Vaastu compliant.")
    MICROWAVE = ColumnInfo("Microwave", "Indicates if a microwave is included.")
    GOLF_COURSE = ColumnInfo("GolfCourse", "Indicates if the property has a golf course.")
    TV = ColumnInfo("TV", "Indicates if a TV is included.")
    DINING_TABLE = ColumnInfo("DiningTable", "Indicates if a dining table is included.")
    SOFA = ColumnInfo("Sofa", "Indicates if a sofa is included.")
    WARDROBE = ColumnInfo("Wardrobe", "Indicates if a wardrobe is included.")
    REFRIGERATOR = ColumnInfo("Refrigerator", "Indicates if a refrigerator is included.")



# Iterate through the attributes of the Feature class and print the info
for attr_name in dir(Column):
    # Skip the special class attributes
    if not attr_name.startswith("__"):
        feature_info = getattr(Column, attr_name)
        print(f"    {feature_info.name}: {feature_info.desc}")

print("\n")
raw_data.info()
print("\n")
raw_data.describe()

Samples: 2518
Columns:
    AC: Indicates if air conditioning is included.
    Area: The area of the property, probably in square feet.
    ATM: Indicates if there is an ATM nearby or within the complex.
    BED: Indicates if a bed is included.
    Cafeteria: Indicates if the property has a cafeteria.
    CarParking: Indicates if car parking is available.
    Children'splayarea: Indicates if there is a children's play area.
    ClubHouse: Indicates if the property has a clubhouse.
    DiningTable: Indicates if a dining table is included.
    Gasconnection: Indicates if the property has a gas connection.
    GolfCourse: Indicates if the property has a golf course.
    Gymnasium: Indicates if the property has a gym.
    Hospital: Indicates if there is a hospital nearby.
    IndoorGames: Indicates if the property has facilities for indoor games.
    Intercom: Indicates if the property has an intercom system.
    JoggingTrack: Indicates if the property has a jogging track.
    LandscapedGar

Unnamed: 0,Price,Area,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,RainWaterHarvesting,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
count,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,...,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0,2518.0
mean,9818380.0,1644.571088,2.623511,0.227562,0.412629,0.739873,0.70969,0.619142,0.559968,0.606434,...,0.975774,0.360207,0.607228,0.342732,0.330024,0.321287,0.334392,0.342335,0.341144,0.324861
std,8777113.0,746.256956,0.68733,0.419341,1.626445,1.611024,1.615231,1.624426,1.627686,1.625308,...,1.557725,1.622778,1.625256,1.621178,1.619894,1.618953,1.620347,1.621139,1.621023,1.619344
min,2000000.0,500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4760000.0,1160.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7754000.0,1500.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10900000.0,1829.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,165000000.0,9400.0,8.0,1.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
