In [2]:
import numpy as np 
import matplotlib.pyplot as pyplot
import pandas as pd
import streamlit as st
import joblib 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [8]:
# Read the housing CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='dataset/housing_new.csv')



In [9]:
# Remove the two 'Unnamed' columns (presumably index columns left behind by
# earlier to_csv round-trips -- confirm against how the CSV was produced).
# errors='ignore' keeps this cell idempotent: running it a second time, after
# the columns are already gone, no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
data.columns


Index(['flat_type', 'floor_area_sqm', 'lease_commence_date', 'resale_price',
       'district', 'range_numeric', 'new_date'],
      dtype='object')

In [10]:
# Reorder the columns: date first, then the descriptive features, with
# resale_price as the last column.
data = data.loc[
    :,
    [
        'new_date',
        'flat_type',
        'district',
        'range_numeric',
        'floor_area_sqm',
        'lease_commence_date',
        'resale_price',
    ],
]

In [11]:
# Plain-text preview of the reordered frame. pandas truncates long frames when
# printing, so only the first and last rows appear in the output.
print(data)

           new_date flat_type    district  range_numeric  floor_area_sqm  \
0       1990.083333    1 ROOM  North-East           11.0            31.0   
1       1990.083333    1 ROOM  North-East            5.0            31.0   
2       1990.083333    1 ROOM  North-East           11.0            31.0   
3       1990.083333    1 ROOM  North-East            8.0            31.0   
4       1990.083333    3 ROOM  North-East            5.0            73.0   
...             ...       ...         ...            ...             ...   
869640  2024.916667    5 ROOM       North            2.0           121.0   
869641  2024.916667    5 ROOM       North            8.0           135.0   
869642  2024.916667    5 ROOM       North            8.0           112.0   
869643  2024.916667    5 ROOM       North           11.0           113.0   
869644  2024.916667    5 ROOM       North            2.0           122.0   

        lease_commence_date  resale_price  
0                      1977        9000.0  

In [12]:

# Column roles used by the preprocessing step.
categorical_columns = ['district', 'flat_type']
numerical_columns = ['floor_area_sqm', 'range_numeric']
# resale_price is kept out of the preprocessor's inputs (presumably it is the
# prediction target -- the example inference row later in the notebook has no
# such column).
target_column = 'resale_price'

# Per-group transformers: one-hot encode the categoricals, standardise the
# numericals.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, i.e. the same encoding as the
# dropped first category.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Output column order is: scaled numericals, one-hot columns, then every other
# input column passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

# Fit on the feature columns only. With remainder='passthrough' every column
# seen at fit time becomes a required input at transform time, so fitting on
# the full frame would make resale_price mandatory and the saved preprocessor
# unusable on rows that do not have a price yet.
feature_frame = data.drop(columns=[target_column])
preprocessor.fit(feature_frame)

# Save the fitted preprocessor for future use.
# NOTE(review): a later cell loads 'preprocessor_predictor.pkl', not this
# file -- confirm which artifact is the intended one.
joblib.dump(preprocessor, 'preprocessor_data.pkl')


 

['preprocessor_data.pkl']

In [51]:
# Build a single example row to run through the preprocessor and the models.
# The row is given as one record, so every field is a plain scalar.
example_record = {
    'new_date': 2024,
    'flat_type': '4 ROOM',
    'district': 'East',
    'range_numeric': 12,
    'floor_area_sqm': 105,
    'lease_commence_date': 1995,
}
new_data = pd.DataFrame([example_record])



In [27]:
# Load a previously saved, fitted preprocessor from disk.
# NOTE(review): this file name differs from the 'preprocessor_data.pkl' written
# by the fitting cell above, so this artifact is not produced by this notebook
# as shown -- confirm where it comes from.
# joblib files are pickle-based; only load artifacts from a trusted source.
loaded_preprocessor = joblib.load(filename='preprocessor_predictor.pkl')

In [52]:
# Run the example row through the loaded preprocessor. The result is wrapped
# in a DataFrame below so that its columns carry names.
transformed_new_data = loaded_preprocessor.transform(new_data)

# Column roles, repeated here so this cell does not depend on the fitting cell.
categorical_columns = ['district', 'flat_type']
numerical_columns = ['floor_area_sqm', 'range_numeric']

# Rebuild the output column names in the order numericals, one-hot columns,
# passthrough columns.
# NOTE(review): this assumes the loaded preprocessor was built with the same
# ('num', 'cat', remainder) layout as the one defined in this notebook.
num_feature_names = numerical_columns
# Take the one-hot names from the object that actually produced the array
# (loaded_preprocessor), not from the separately fitted `preprocessor`, so the
# labels cannot drift from the data and the cell works without the fit cell.
cat_feature_names = loaded_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)
transformed_columns = set(numerical_columns + categorical_columns)
remainder_feature_names = [col for col in new_data.columns if col not in transformed_columns]

# Combine all feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names) + remainder_feature_names

# Create a DataFrame with feature names; pandas raises ValueError here if the
# number of names does not match the number of transformed columns.
transformed_new_data = pd.DataFrame(transformed_new_data, columns=all_feature_names)
print("Transformed New Data:")
print(transformed_new_data)

Transformed New Data:
   floor_area_sqm  range_numeric  district_East  district_North  \
0        0.615177       0.867699            1.0             0.0   

   district_North-East  district_West  flat_type_2 ROOM  flat_type_3 ROOM  \
0                  0.0            0.0               0.0               0.0   

   flat_type_4 ROOM  flat_type_5 ROOM  new_date  lease_commence_date  
0               1.0               0.0    2024.0               1995.0  


In [54]:
# Load the three fitted models, in the order rf, xgb, cat.
# joblib files are pickle-based; only load artifacts from a trusted source.
model_rf, model_xgb, model_cat = map(
    joblib.load,
    (
        'models/new_model_rf.pkl',
        'models/new_model_xgb.pkl',
        'models/new_model_cat.pkl',
    ),
)

In [55]:
# Blend weights for the ensemble prediction. The prediction cell applies
# index 0 to model_xgb, index 1 to model_rf and index 2 to model_cat.
best_weight = [
    0.32,
    0.23,
    0.45,
]

In [56]:

# Weighted blend of the three model predictions for the example row.
# NOTE(review): the weights are paired as xgb/rf/cat here, while the models
# were loaded in the order rf/xgb/cat -- confirm best_weight follows this order.
y_pred = (
    best_weight[0] * model_xgb.predict(transformed_new_data)
    + best_weight[1] * model_rf.predict(transformed_new_data)
    + best_weight[2] * model_cat.predict(transformed_new_data)
)
print("Resale Valuation")
print(y_pred)

Resale Valuation
[587081.96636163]


In [None]:
# District mapping: upper-case area name -> region label
# ('Central', 'East', 'North', 'North-East' or 'West').
# NOTE(review): 'KALLANG_WHAMPOA' is spelled with an underscore -- confirm it
# matches the spelling used in the data this mapping is applied to.
district_mapping = dict((
    ('ANG MO KIO', 'North-East'),
    ('BEDOK', 'East'),
    ('BISHAN', 'Central'),
    ('BUKIT BATOK', 'West'),
    ('BUKIT MERAH', 'Central'),
    ('BUKIT TIMAH', 'Central'),
    ('CENTRAL AREA', 'Central'),
    ('CHOA CHU KANG', 'West'),
    ('CLEMENTI', 'West'),
    ('GEYLANG', 'Central'),
    ('HOUGANG', 'North-East'),
    ('JURONG EAST', 'West'),
    ('JURONG WEST', 'West'),
    ('KALLANG_WHAMPOA', 'Central'),
    ('MARINE PARADE', 'East'),
    ('QUEENSTOWN', 'Central'),
    ('SENGKANG', 'North-East'),
    ('SERANGOON', 'North-East'),
    ('TAMPINES', 'East'),
    ('TOA PAYOH', 'Central'),
    ('WOODLANDS', 'North'),
    ('YISHUN', 'North'),
    ('LIM CHU KANG', 'West'),
    ('SEMBAWANG', 'North'),
    ('BUKIT PANJANG', 'West'),
    ('PASIR RIS', 'East'),
    ('PUNGGOL', 'North-East'),
))

