In [None]:
# Import the third-party libraries used by the notebook.
# If one is missing, report which one and re-raise: the later cells call
# `pd`, so carrying on would only fail further down with a less helpful
# NameError.
try:
    import pandas as pd
    import numpy as np
    print("imported successfully")
except ModuleNotFoundError as err:
    print(f"please install the missing module: {err.name}")
    raise

imported successfully


In [None]:
#---------------------------------------
# Load Data 
#---------------------------------------
# Read the raw CSV, parsing 'instance_date' as datetimes and using it as the index.
try:
    raw_df = pd.read_csv(
        "raw_data.csv",
        parse_dates=['instance_date'],   # Converts to datetime
        index_col='instance_date',       # Sets it as the index immediately
        dayfirst=True                    # Tells pandas the day comes before month (DD-MM-YYYY)
    )
    print(f"data_shape:{raw_df.shape}")
except FileNotFoundError:
    print("csv not founded")
    # Stop here: without the file `raw_df` is never created, so continuing
    # would fail on the next statement with a misleading NameError.
    raise

print(raw_df.columns)

data_shape:(1047965, 45)
Index(['transaction_id', 'procedure_id', 'trans_group_id', 'trans_group_ar',
       'trans_group_en', 'procedure_name_ar', 'procedure_name_en',
       'property_type_id', 'property_type_ar', 'property_type_en',
       'property_sub_type_id', 'property_sub_type_ar', 'property_sub_type_en',
       'property_usage_ar', 'property_usage_en', 'reg_type_id', 'reg_type_ar',
       'reg_type_en', 'area_id', 'area_name_ar', 'area_name_en',
       'building_name_ar', 'building_name_en', 'project_number',
       'project_name_ar', 'project_name_en', 'master_project_en',
       'master_project_ar', 'nearest_landmark_ar', 'nearest_landmark_en',
       'nearest_metro_ar', 'nearest_metro_en', 'nearest_mall_ar',
       'nearest_mall_en', 'rooms_ar', 'rooms_en', 'has_parking',
       'procedure_area', 'actual_worth', 'meter_sale_price', 'rent_value',
       'meter_rent_price', 'no_of_parties_role_1', 'no_of_parties_role_2',
       'no_of_parties_role_3'],
      dtype='object')


**Extract the columns and years needed for the analysis**

In [None]:
#---------------------------------------
# Extract data that needed for analysis
#---------------------------------------
# Columns of raw_df kept for the analysis (original names, before renaming).
ANALYSIS_COLUMNS = [
    "property_type_en", "trans_group_en", "property_sub_type_en", "property_usage_en",
    "area_name_en", "building_name_en", "project_name_en", "nearest_landmark_en",
    "nearest_metro_en", "nearest_mall_en", "rooms_en", "has_parking", "procedure_area",
    "actual_worth", "meter_sale_price", "rent_value", "meter_rent_price",
]

# Years of transactions to analyze.
ANALYSIS_YEARS = [2022, 2023]

# Low-cardinality text columns stored as 'category' (memory optimization).
CATEGORY_COLUMNS = ["property_type", "trans_group", "property_usage"]

# Build the working frame WITHOUT reassigning raw_df, so this cell can be
# re-run safely (raw_df keeps its original column names).
selected_df = (
    raw_df[ANALYSIS_COLUMNS]
    # many columns end with "_en": strip it, but only when it is a suffix
    .rename(columns=lambda name: name[:-len("_en")] if name.endswith("_en") else name)
    # rename columns
    .rename(columns={"has_parking": "parking", "procedure_area": "area"})
    .astype({column: "category" for column in CATEGORY_COLUMNS})
)

# drop rows where the Index (Date) is missing/empty
selected_df = selected_df[selected_df.index.notna()]

# checking how many years of data we have
print(selected_df.index.year.sort_values(ascending=False).unique())

# ---------------------------------------------------------------
# we have to analyze 2022 and 2023 data,
# so keep only the rows whose year is 2022 OR 2023
# ---------------------------------------------------------------
# reset_index() returns a new frame in which 'instance_date' is a regular
# column again (no in-place mutation of a filtered slice).
clean_df = selected_df[selected_df.index.year.isin(ANALYSIS_YEARS)].reset_index()

Index([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012,
       2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000,
       1999, 1998, 1997, 1996, 1995],
      dtype='int32', name='instance_date')


**Strip leading and trailing whitespace from the object (text) columns**

In [None]:
# Whitespace clean-up for text columns.
# Only columns with 'object' dtype are selected here; columns of any other
# dtype are left exactly as they are.
cols = clean_df.select_dtypes(include=['object']).columns


def _strip_text(column):
    """Return `column` with leading/trailing whitespace removed from its string values."""
    return column.str.strip()


# Run the stripping column by column and write the results back in place
clean_df[cols] = clean_df[cols].apply(_strip_text)

**Check and handle null_values and duplicates**

In [None]:
# --- Duplicates -------------------------------------------------------
# Share of fully duplicated rows, reported as a percentage of all rows
n_rows = clean_df.shape[0]
n_duplicates = clean_df.duplicated().sum()
print(f"duplicate_rows:{(n_duplicates / n_rows) * 100} %")

clean_df = clean_df.drop_duplicates()
print("duplicate_rows removed successfully")

# --- Missing values ---------------------------------------------------
# Percentage of null values per column (computed after de-duplication)
null_counts = clean_df.isnull().sum()
print((null_counts / clean_df.shape[0]) * 100)

# "rent_value" and "meter_rent_price" are null in almost every row,
# so both columns are dropped
clean_df = clean_df.drop(columns=['rent_value', 'meter_rent_price'])


duplicate_rows:4.16821394833708 %
duplicate_rows removed successfully
instance_date         0.000000
property_type         0.000000
trans_group           0.000000
property_sub_type    19.200935
property_usage        0.000000
area_name             0.000000
building_name        29.712367
project_name         19.610189
nearest_landmark     21.381129
nearest_metro        32.429621
nearest_mall         32.664160
rooms                19.828413
parking               0.000000
area                  0.000000
actual_worth          0.265811
meter_sale_price      0.000000
rent_value           99.506448
meter_rent_price     99.506448
dtype: float64


In [None]:
# Keep only the rows that have a value in 'actual_worth'
# (axis=0 drops rows; 'subset' limits the null check to that one column)
clean_df = clean_df.dropna(axis=0, subset=['actual_worth'])

**Convert the "rooms" column from "object" (text) to a numeric dtype**

In [None]:
# Frequency of each distinct label in the 'rooms' column
room_label_counts = clean_df["rooms"].value_counts()
print(room_label_counts)

rooms
1 B/R          40084
2 B/R          29727
Studio         18036
3 B/R          18012
4 B/R           7032
Office          3331
Shop             779
5 B/R            765
PENTHOUSE         97
6 B/R             32
Single Room       11
GYM                4
7 B/R              1
9 B/R              1
Name: count, dtype: int64


**clean the rooms column first**

In [None]:
# Turn the text labels in 'rooms' into numbers with one method chain.
# The steps run in this order:
#   1. cast everything to str
#   2. drop the literal "B/R" marker ("1 B/R" -> "1 ")
#   3. map the non-numeric labels to digits via regex substitution:
#        Studio / Office / Shop / GYM -> "0"
#        Single Room                  -> "1"
#        PENTHOUSE                    -> "4"
#   4. strip the whitespace that is left over
rooms_as_text = (
    clean_df['rooms']
    .astype(str)
    .str.replace("B/R", "", regex=False)
    .str.replace("Studio|Office|Shop|GYM", "0", regex=True)
    .str.replace("Single Room", "1", regex=True)
    .str.replace("PENTHOUSE", "4", regex=True)
    .str.strip()
)

# 5. Convert to numeric; anything that is still not a number becomes NaN,
#    so the resulting column is float
clean_df['rooms'] = pd.to_numeric(rooms_as_text, errors='coerce')

# 6. Check the result (NaN included in the counts)
print(clean_df['rooms'].value_counts(dropna=False))

rooms
1.0    40095
2.0    29727
NaN    28794
0.0    22150
3.0    18012
4.0     7129
5.0      765
6.0       32
7.0        1
9.0        1
Name: count, dtype: int64


In [None]:
# Rows that still have no room count, broken down by usage and by type
rooms_is_missing = clean_df['rooms'].isnull()
missing_rooms = clean_df[rooms_is_missing]

for breakdown_column in ('property_usage', 'property_type'):
    print(missing_rooms[breakdown_column].value_counts())

property_usage
Residential                              15172
Commercial                                9112
Other                                     3194
Residential / Commercial                   592
Multi-Use                                  372
Industrial                                 228
Agricultural                                51
Industrial / Commercial                     39
Hospitality                                 16
Industrial / Commercial / Residential       11
Storage                                      7
Name: count, dtype: int64
property_type
Land        17640
Villa        9034
Building     1319
Unit          801
Name: count, dtype: int64


In [None]:
def estimate_rooms(row):
    """Return the room count for one row, estimating it only when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'rooms', 'property_type', 'property_usage'
        and 'area'.

    Returns
    -------
    The existing 'rooms' value when it is not null. Otherwise 0 for Land and
    for Commercial usage, NaN when 'area' is also missing, and else an
    estimate between 0 and 5 derived from 'area'.
    """
    # 1. A recorded room count always wins: this function only fills gaps and
    #    must never overwrite a valid value.
    if pd.notna(row['rooms']):
        return row['rooms']

    # 2. Land is treated as having 0 rooms
    if row['property_type'] == 'Land':
        return 0

    # 3. Commercial usage is treated as 0 rooms (or open space)
    if row['property_usage'] == 'Commercial':
        return 0

    area = row['area']

    # 4. Without an area there is nothing to base an estimate on; return NaN
    #    instead of letting every comparison below fail and yield 5.
    if pd.isna(area):
        return float('nan')

    # 5. Heuristic bands: (exclusive upper bound of 'area', estimated rooms).
    #    NOTE(review): thresholds are presumably in the same unit as the
    #    'area' column - confirm against the data source.
    area_bands = (
        (50, 0),   # Studio
        (90, 1),
        (140, 2),
        (200, 3),
        (350, 4),
    )
    for exclusive_limit, estimated_rooms in area_bands:
        if area < exclusive_limit:
            return estimated_rooms

    return 5  # Large luxury unit

# Apply the estimate ONLY to rows with missing rooms; rows that already have
# a room count are not passed to the function and keep their value.
rooms_missing = clean_df['rooms'].isna()
if rooms_missing.any():
    clean_df.loc[rooms_missing, 'rooms'] = (
        clean_df.loc[rooms_missing].apply(estimate_rooms, axis=1)
    )

**Handling Remaining Missing Values**

In [None]:
# Text columns whose missing entries are replaced by the placeholder 'Unknown'
cols_to_fill = [
    'nearest_metro',
    'nearest_mall',
    'building_name',
    'project_name',
    'nearest_landmark',
    'property_sub_type',
]

# Fill the columns one at a time
for column_name in cols_to_fill:
    clean_df[column_name] = clean_df[column_name].fillna('Unknown')

# Remaining share of nulls per column, in percent
remaining_nulls = clean_df.isnull().sum()
print((remaining_nulls / clean_df.shape[0]) * 100)

instance_date        0.0
property_type        0.0
trans_group          0.0
property_sub_type    0.0
property_usage       0.0
area_name            0.0
building_name        0.0
project_name         0.0
nearest_landmark     0.0
nearest_metro        0.0
nearest_mall         0.0
rooms                0.0
parking              0.0
area                 0.0
actual_worth         0.0
meter_sale_price     0.0
dtype: float64


**Remove "actual_worth" outlier**

In [None]:
# IQR rule on 'actual_worth': keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
worth_values = clean_df['actual_worth']

# Quartiles and inter-quartile range
Q1 = worth_values.quantile(0.25)
Q3 = worth_values.quantile(0.75)
IQR = Q3 - Q1

# Fences (both ends are inclusive in the filter below)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() includes both endpoints by default
worth_in_range = worth_values.between(lower_bound, upper_bound)
clean_df = clean_df[worth_in_range]

print(f"Data shape after removing price outliers: {clean_df.shape}")

Data shape after removing price outliers: (134897, 16)


**Remove "area" outliers**

In [None]:
# IQR rule on 'area', plus a sanity check that the area is strictly positive
area_values = clean_df['area']

# Quartiles and inter-quartile range for area
Q1_area = area_values.quantile(0.25)
Q3_area = area_values.quantile(0.75)
IQR_area = Q3_area - Q1_area

# Fences for area (both ends are inclusive in the filter below)
lower_bound_area = Q1_area - 1.5 * IQR_area
upper_bound_area = Q3_area + 1.5 * IQR_area

# A row is kept when its area lies within the fences AND is greater than 0
area_in_range = area_values.between(lower_bound_area, upper_bound_area)
area_is_positive = area_values > 0
clean_df = clean_df[area_in_range & area_is_positive]

print(f"Final Data shape after removing area outliers: {clean_df.shape}")

Final Data shape after removing area outliers: (121269, 16)


In [None]:
# Write the cleaned frame to disk as CSV, leaving out the row index
output_csv = "cleaned_dubai_data.csv"
clean_df.to_csv(output_csv, index=False)