In [2]:
import numpy as np 
import matplotlib.pyplot as pyplot
import pandas as pd
import streamlit as st
import joblib 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Column groups handled by the preprocessor.
# NOTE(review): 'remaining_lease' is not among the columns listed after the
# CSV is loaded further down; confirm which frame this definition targets.
numerical_columns = ['remaining_lease', 'floor_area_sqm', 'range_numeric']
categorical_columns = ['district', 'flat_type']

# One transformer per column group: scaling for numeric columns and
# one-hot encoding (dense output, first category dropped) for categoricals.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, drop='first')

# (name, transformer, columns) triples, in the order their outputs appear.
transformer_specs = [
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
]

# Columns not named above are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=transformer_specs,
    remainder='passthrough',
)

# Show the (unfitted) preprocessor structure.
print(preprocessor)


In [8]:
# Read the housing dataset from a CSV file in the working directory.
housing_csv_path = 'housing_new.csv'
data = pd.read_csv(housing_csv_path)



In [9]:
# Remove the leftover index columns from the CSV.
# errors='ignore' keeps this cell re-runnable: because `data` is reassigned,
# a second execution would otherwise raise KeyError on the already-dropped
# columns.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
data.columns


Index(['flat_type', 'floor_area_sqm', 'lease_commence_date', 'resale_price',
       'district', 'range_numeric', 'new_date'],
      dtype='object')

In [10]:
# Reorder the columns of `data`; every listed column must already exist.
column_order = [
    'new_date',
    'flat_type',
    'district',
    'range_numeric',
    'floor_area_sqm',
    'lease_commence_date',
    'resale_price',
]
data = data.loc[:, column_order]

In [11]:

# Column groups handled explicitly; any other column in the fitted frame is
# forwarded unchanged because of remainder='passthrough'.
numerical_columns = ['floor_area_sqm', 'range_numeric']
categorical_columns = ['district', 'flat_type']

# Scale numeric columns; one-hot encode categoricals (dense output, first
# category dropped, categories unseen at fit time are ignored).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# (name, transformer, columns) triples, in the order their outputs appear.
column_steps = [
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# Fit on the full dataset.
# NOTE(review): `data` still holds 'new_date', 'lease_commence_date' and
# 'resale_price' at this point, so they become passthrough columns of the
# fitted object; confirm that frames passed to transform() later are meant
# to carry the same columns.
preprocessor.fit(data)

# Persist the fitted preprocessor; the cell's output is the list of written files.
joblib.dump(preprocessor, 'preprocessor.pkl')


 

['preprocessor.pkl']

In [43]:
# Example input: a single flat described as one record (one row).
example_flat = {
    'year': 2100,
    'district': 'East',
    'flat_type': '5 ROOM',
    'remaining_lease': 71,
    'floor_area_sqm': 100,
    'range_numeric': 2,
    'lease_commence_date': 1996,
}

# A one-element list of records yields a one-row frame whose columns follow
# the key order of the record.
new_data = pd.DataFrame([example_flat])



In [44]:
# Load the persisted preprocessor and transform the new data point.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_preprocessor = joblib.load('preprocessor.pkl')
transformed_new_data = loaded_preprocessor.transform(new_data)

# Read the column groups back from the *loaded* object instead of hard-coding
# them, so the generated names always describe the preprocessor that actually
# produced `transformed_new_data` (not whatever `preprocessor` variable happens
# to be in the kernel).
fitted_columns = {name: cols for name, _, cols in loaded_preprocessor.transformers_}
numerical_columns = list(fitted_columns['num'])
categorical_columns = list(fitted_columns['cat'])

# Output layout is: numeric columns, one-hot columns, then passthrough columns.
num_feature_names = numerical_columns
cat_feature_names = (
    loaded_preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_columns)
)
# Assumes the passthrough columns come out in the column order of `new_data`
# — TODO confirm against the frame the preprocessor was fitted on.
handled_columns = set(numerical_columns) | set(categorical_columns)
remainder_feature_names = [col for col in new_data.columns if col not in handled_columns]

# Combine all feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names) + remainder_feature_names

# Create a DataFrame with feature names, then select the columns in a fixed
# order (a KeyError here means the preprocessor did not produce one of them).
transformed_new_data = pd.DataFrame(transformed_new_data, columns=all_feature_names)
transformed_new_data = transformed_new_data[['year', 'floor_area_sqm', 'range_numeric', 'lease_commence_date',
       'remaining_lease', 'flat_type_2 ROOM', 'flat_type_3 ROOM',
       'flat_type_4 ROOM', 'flat_type_5 ROOM', 'flat_type_EXECUTIVE',
       'flat_type_MULTI-GENERATION', 'district_East', 'district_North',
       'district_North-East', 'district_West']]
print("Transformed New Data:")
print(transformed_new_data)

Transformed New Data:
     year  floor_area_sqm  range_numeric  lease_commence_date  \
0  2100.0        0.167187       -1.17848               1996.0   

   remaining_lease  flat_type_2 ROOM  flat_type_3 ROOM  flat_type_4 ROOM  \
0        -0.919453               0.0               0.0               0.0   

   flat_type_5 ROOM  flat_type_EXECUTIVE  flat_type_MULTI-GENERATION  \
0               1.0                  0.0                         0.0   

   district_East  district_North  district_North-East  district_West  
0            1.0             0.0                  0.0            0.0  


In [45]:
# Load the saved model and predict for the transformed example row.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
model = joblib.load('price_prediction_model_all.pkl')
y_pred = model.predict(transformed_new_data)

# Heading and prediction on separate lines.
print("Resale Valuation", y_pred, sep="\n")

Resale Valuation
[666366.80964031]


In [None]:
# District mapping: town name -> district label.
# Built from (town, district) pairs; insertion order is preserved.
# NOTE(review): 'KALLANG_WHAMPOA' is spelled with an underscore; confirm it
# matches the town spelling in the data this mapping is applied to.
_town_district_pairs = [
    ('ANG MO KIO', 'North-East'),
    ('BEDOK', 'East'),
    ('BISHAN', 'Central'),
    ('BUKIT BATOK', 'West'),
    ('BUKIT MERAH', 'Central'),
    ('BUKIT TIMAH', 'Central'),
    ('CENTRAL AREA', 'Central'),
    ('CHOA CHU KANG', 'West'),
    ('CLEMENTI', 'West'),
    ('GEYLANG', 'Central'),
    ('HOUGANG', 'North-East'),
    ('JURONG EAST', 'West'),
    ('JURONG WEST', 'West'),
    ('KALLANG_WHAMPOA', 'Central'),
    ('MARINE PARADE', 'East'),
    ('QUEENSTOWN', 'Central'),
    ('SENGKANG', 'North-East'),
    ('SERANGOON', 'North-East'),
    ('TAMPINES', 'East'),
    ('TOA PAYOH', 'Central'),
    ('WOODLANDS', 'North'),
    ('YISHUN', 'North'),
    ('LIM CHU KANG', 'West'),
    ('SEMBAWANG', 'North'),
    ('BUKIT PANJANG', 'West'),
    ('PASIR RIS', 'East'),
    ('PUNGGOL', 'North-East'),
]

district_mapping = dict(_town_district_pairs)

