# Dubai House Price Project
### Submitted by Aswathy Gopalakrishnan

### Import Libraries

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [56]:
# Read the raw housing dataset (CSV in the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='housing_price_dataset.csv')

### Data Understanding

In [57]:
# Report the overall size of the dataset.
rows = data.shape[0]  # number of records
cols = data.shape[1]  # number of columns
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")

Number of rows: 50000
Number of columns: 6


In [58]:
# Show the dtype pandas inferred for every column (header and table in one call).
print("\nData types of each column:", data.dtypes, sep="\n")


Data types of each column:
SquareFeet        int64
Bedrooms          int64
Bathrooms         int64
Neighborhood     object
YearBuilt         int64
Price           float64
dtype: object


In [59]:
# List the distinct values of every column together with how many there are.
print("\nUnique values in each column:")
for column in data:  # iterating a DataFrame yields its column labels
    unique_vals = data[column].unique()
    print(f"{column} ({len(unique_vals)} unique values): {unique_vals}")


Unique values in each column:
SquareFeet (2000 unique values): [2126 2459 1860 ... 2864 2330 2084]
Bedrooms (4 unique values): [4 3 2 5]
Bathrooms (3 unique values): [1 2 3]
Neighborhood (3 unique values): ['Rural' 'Suburb' 'Urban']
YearBuilt (72 unique values): [1969 1980 1970 1996 2001 2020 1993 1957 1959 2004 1951 1987 1992 1989
 1976 1956 1977 1979 1962 1999 1978 1963 2013 2002 1952 2014 1965 2006
 2008 1982 1966 1971 2021 1960 2016 2007 1994 1988 2000 2015 1983 1961
 1972 2017 1985 1975 2012 2019 1995 1986 1981 1954 1958 2005 1984 1997
 2003 1964 2018 1953 1968 1955 1967 1950 1998 1991 2010 1973 2011 1990
 1974 2009]
Price (50000 unique values): [215355.2836182  195014.22162585 306891.01207633 ... 384110.55559035
 380512.68595684 221618.58321807]


### Missing and Duplicate Values

In [60]:
# Count missing entries per column (isna is the same check as isnull).
missing_counts = data.isna().sum()
print("\nMissing values per column:", missing_counts, sep="\n")


Missing values per column:
SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64


In [61]:
# Flag rows that exactly repeat an earlier row, then count the flags.
is_duplicate = data.duplicated()
duplicate_count = is_duplicate.sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0


### Handling Data Ambiguities

Some values in the `Price` column are negative. These are made positive by taking the absolute value.

In [62]:
# Count the rows whose 'Price' is negative before changing anything.
negative_price_count = len(data.loc[data['Price'].lt(0)])
print(f"Number of rows with negative prices: {negative_price_count}")

# Replace every price by its magnitude; non-negative prices are unaffected.
data['Price'] = data['Price'].abs()
print("Negative prices converted to positive.")


Number of rows with negative prices: 22
Negative prices converted to positive.


Some prices look abnormally low, even for a large square footage, a large number of bedrooms, and a good neighbourhood. Let's print these rows using a threshold of 10,000.

In [63]:

# Select and show the rows whose price is below 10000
below_threshold_rows = data.loc[data['Price'].lt(10000.0000)]
print("\nRows with prices below 10000.0000:", below_threshold_rows, sep="\n")


Rows with prices below 10000.0000:
       SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt        Price
2310         1036         4          1       Suburb       1983  7550.504574
4537         1102         2          1       Suburb       1967  5796.058628
4993         1161         2          2       Suburb       1972  8336.559189
5426         1492         3          1       Suburb       1970  9255.923304
5951         1097         4          3        Rural       1981  4537.418615
10597        1177         2          3        Urban       2010   434.097124
11991        1213         4          1       Suburb       2020  4910.415323
15131        1178         4          2        Urban       2010  6124.032174
17216        1013         2          1       Suburb       2018   276.063516
17442        1600         2          3        Rural       1989  8238.884499
20759        1036         2          2        Urban       1957  4810.724320
23650        1024         4          3       Suburb 

The prices in those rows seem to have a misplaced decimal point, making them unrealistically low. Let's correct this by shifting the decimal point so that there are six digits before it (i.e. the price is on the order of 10^5).

In [None]:
# Boolean mask (one entry per row) marking prices below 10000
below_threshold_rows = data['Price'].lt(10000.0000)

# Function to shift the decimal so the price reaches the target order of magnitude
# (with the default target_order=5 the result has six digits before the decimal point).
def correct_decimal(price, target_order=5):
    """Scale a too-small price up by a power of ten.

    The price is multiplied by ``10 ** (target_order - order)``, where ``order``
    is ``floor(log10(price))``, so that the result lies in
    ``[10 ** target_order, 10 ** (target_order + 1))``.

    Parameters
    ----------
    price : float
        The price to correct.
    target_order : int, default 5
        Desired order of magnitude (power of ten) of the corrected price.

    Returns
    -------
    float
        The rescaled price. Prices already at or above the target order are
        returned unchanged, as are negative and NaN prices; ``0`` is returned
        for a zero price.
    """
    if price == 0:  # Avoid issues with zero values (log10(0) is -inf)
        return 0
    if not price > 0:
        # Negative or NaN: log10 is undefined here, so leave the value untouched
        # instead of relying on a NaN comparison (and a RuntimeWarning) to do so.
        return price
    order = np.floor(np.log10(price))  # Find the order of magnitude of the price
    if order < target_order:  # Fewer than target_order + 1 digits before the decimal
        shift = target_order - order  # Calculate how many places to shift
        price *= (10 ** shift)  # Multiply to shift the decimal
    return price

# Rewrite only the flagged prices, passing each one through correct_decimal
data.loc[below_threshold_rows, 'Price'] = (
    data.loc[below_threshold_rows, 'Price'].map(correct_decimal)
)

# Show the flagged rows again, now holding the corrected prices
print("Corrected rows:", data.loc[below_threshold_rows], sep="\n")


Corrected rows:
       SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price
2310         1036         4          1       Suburb       1983  755050.457436
4537         1102         2          1       Suburb       1967  579605.862781
4993         1161         2          2       Suburb       1972  833655.918927
5426         1492         3          1       Suburb       1970  925592.330405
5951         1097         4          3        Rural       1981  453741.861489
10597        1177         2          3        Urban       2010  434097.124094
11991        1213         4          1       Suburb       2020  491041.532315
15131        1178         4          2        Urban       2010  612403.217362
17216        1013         2          1       Suburb       2018  276063.516078
17442        1600         2          3        Rural       1989  823888.449883
20759        1036         2          2        Urban       1957  481072.432043
23650        1024         4          3       Sub

### Data Wrangling

Adding three additional columns for future visualization and analysis purposes.

In [65]:
# Reference year used to turn a construction year into an age
current_year = 2025

# Column 1: price_per_sqft, the price divided by the floor area
data['price_per_sqft'] = data['Price'].div(data['SquareFeet'])

# Column 2: property_age, the years elapsed between YearBuilt and current_year
data['property_age'] = current_year - data['YearBuilt']

# Cut points for column 3 (listing_category): the 33rd and 66th percentiles of
# Price, indexed by 0.33 and 0.66
quantiles = data['Price'].quantile(q=[0.33, 0.66])

def categorize_price(price, budget_max=None, mid_range_max=None):
    """Label a price as 'Budget', 'Mid-Range' or 'High-End'.

    Parameters
    ----------
    price : float
        The price to classify.
    budget_max : float, optional
        Inclusive upper bound of the 'Budget' tier. Defaults to the
        module-level ``quantiles[0.33]``.
    mid_range_max : float, optional
        Inclusive upper bound of the 'Mid-Range' tier. Defaults to the
        module-level ``quantiles[0.66]``.

    Returns
    -------
    str
        'Budget' if ``price <= budget_max``, 'Mid-Range' if
        ``price <= mid_range_max``, otherwise 'High-End'. A NaN price fails
        both comparisons and therefore falls through to 'High-End'.
    """
    # Fall back to the notebook-level cut points only when none were supplied,
    # so the function can also be used (and checked) with explicit thresholds.
    if budget_max is None:
        budget_max = quantiles[0.33]
    if mid_range_max is None:
        mid_range_max = quantiles[0.66]

    if price <= budget_max:
        return 'Budget'
    elif price <= mid_range_max:
        return 'Mid-Range'
    else:
        return 'High-End'

# Label every listing by passing its price through categorize_price
data['listing_category'] = data['Price'].map(categorize_price)

# Preview the first five rows to check the new columns
print(data.head(5))


   SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price  \
0        2126         4          1        Rural       1969  215355.283618   
1        2459         3          2        Rural       1980  195014.221626   
2        1860         2          1       Suburb       1970  306891.012076   
3        2294         2          1        Urban       1996  206786.787153   
4        2130         5          2       Suburb       2001  272436.239065   

   price_per_sqft  property_age listing_category  
0      101.295994            56        Mid-Range  
1       79.306312            45        Mid-Range  
2      164.995168            55         High-End  
3       90.142453            29        Mid-Range  
4      127.904338            24         High-End  


### Saving File for Dashboard Creation

In [66]:
# Write the cleaned and enriched table to CSV; index=False omits the row index column.
data.to_csv(path_or_buf='processed_data.csv', index=False)