In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, ShuffleSplit, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Read the rental listings CSV (path is relative to the notebook's working directory)
df = pd.read_csv('mudah-apartment-kl-selangor.csv')
# Show (rows, columns) of the raw frame as a quick sanity check
df.shape

(19991, 14)

In [3]:
# List the distinct property_type values present in the raw data
df.property_type.unique()

array(['Condominium', 'Apartment', 'Service Residence', 'Studio', 'Flat',
       'Duplex', 'Others', 'Townhouse Condo',
       'Condo / Services residence / Penthouse / Townhouse',
       'Residential', 'Bungalow House', 'Houses', 'Soho'], dtype=object)

In [4]:
def _digits_as_int(text):
    """Strip every non-digit character from ``text`` and parse what is left as an int."""
    return int(re.sub(r'[^\d]', '', text))


# Missing parking counts are imputed with the column median (computed before any rows are dropped).
df['parking'] = df['parking'].fillna(df['parking'].median())

# Rows lacking any of these fields are discarded; afterwards the columns that are
# not used as model inputs are removed.
df = (
    df
    .dropna(subset=['monthly_rent', 'rooms', 'bathroom', 'additional_facilities', 'furnished'])
    .drop(columns=['prop_name', 'completion_year', 'additional_facilities', 'facilities', 'ads_id'])
)

# Rent and size are parsed by keeping only their digits.
for text_column in ('monthly_rent', 'size'):
    df[text_column] = df[text_column].apply(_digits_as_int)

# 'More than 10' cannot be converted to a number, so those listings are removed
# before the room count is cast to float.
too_many_rooms = df.index[df['rooms'] == 'More than 10']
df = df.drop(index=too_many_rooms)
df['rooms'] = df['rooms'].astype(float)

# Keep the second ' - '-separated part of the location string.
df['location'] = df['location'].str.split(' - ').str[1]

# Renumber rows 0..n-1 now that rows have been dropped.
df = df.reset_index(drop=True)


In [5]:
# Rent divided by size; a temporary column used for outlier filtering and dropped afterwards.
df['price_per_sqft'] = df['monthly_rent'].div(df['size'])

In [6]:
# Listings per location, ascending.
location_stats = df.groupby('location')['location'].count().sort_values()

# Locations with at most 37 listings are collapsed into a single 'Other' bucket.
location_less = location_stats[location_stats <= 37]
is_rare_location = df['location'].isin(location_less.index)
df['location'] = df['location'].mask(is_rare_location, 'Other')

# Property types that are folded into 'Other'.
replace_list = ['Flat', 'Studio', 'Others', 'Duplex', 'Townhouse Condo']
df['property_type'] = df['property_type'].replace(to_replace=replace_list, value='Other')

In [7]:
# IQR-based outlier removal
def find_outliers(df, column, factor=1.5):
    """Return a copy of ``df`` without the rows whose ``column`` value is an IQR outlier.

    A value counts as an outlier when it is below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``. NaN values satisfy neither comparison, so
    those rows are kept.

    Note that, despite its name, the function returns the cleaned frame rather
    than the outlier rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, in their original order and
        with their original index labels.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)

    IQR = Q3 - Q1
    # Kept from the original implementation: the IQR is echoed for inspection.
    print(IQR)
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    is_outlier = (df[column] > upper_bound) | (df[column] < lower_bound)
    # Filter by boolean mask instead of dropping by index label: dropping labels
    # would also remove non-outlier rows that share a label with an outlier
    # whenever the index is not unique.
    df_cleaned = df.loc[~is_outlier].copy()
    return df_cleaned

In [8]:
# Filter outliers on price_per_sqft only (parking is not filtered), then remove
# the helper column together with 'region' so neither reaches the model.
df = find_outliers(df, 'price_per_sqft').drop(columns=['price_per_sqft', 'region'])

0.9339791274114955


In [9]:
# Numeric inputs: standardised with StandardScaler.
numeric_features = ['rooms', 'parking', 'bathroom', 'size']
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical inputs: one-hot encoded. handle_unknown='error' means a category
# that was not present when the encoder was fitted raises at transform time.
categorical_features = ['location', 'property_type', 'furnished']
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='error'))])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [10]:
# Random forest regressor with a fixed seed, chained after the preprocessing step.
model = RandomForestRegressor(random_state=10)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# monthly_rent is the target; every remaining column is a feature.
y = df['monthly_rent']
X = df.drop(columns=['monthly_rent'])

# Hold out 25% of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=10
)

# Fit on the training rows, then predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 174245.9268509761


In [250]:
import pickle

# Persist the fitted pipeline (preprocessing + model) to disk.
with open('price_model.pickle', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [30]:
import pickle

# Restore the pipeline from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('price_model.pickle', 'rb') as model_file:
    loaded_pipeline = pickle.load(model_file)

In [24]:
import json

# Sorted distinct values of each categorical input column.
location_names_list, furnished_list, property_type_list = (
    sorted(df[column_name].unique().tolist())
    for column_name in ('location', 'furnished', 'property_type')
)

# Bundle the three lists under the keys stored in the JSON file.
data = dict(
    locations=location_names_list,
    furnished=furnished_list,
    property_types=property_type_list,
)

# Write the bundle as indented JSON.
with open('value_names.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [31]:
# Predict the rent for one listing
def predict_rent_price(model_pipeline, location, property_type, rooms, parking, bathroom, size, furnished):
    """Predict the rent for a single listing described by the given attributes.

    The attributes are assembled into a one-row DataFrame, which is printed and
    then passed to ``model_pipeline.predict``; the first (only) prediction is
    returned.
    """
    column_names = ('location', 'property_type', 'rooms', 'parking',
                    'bathroom', 'size', 'furnished')
    column_values = (location, property_type, rooms, parking,
                     bathroom, size, furnished)

    input_df = pd.DataFrame([dict(zip(column_names, column_values))])
    print(input_df)
    return model_pipeline.predict(input_df)[0]

In [69]:
# Example prediction using the pipeline restored from disk.
predict_rent_price(
    loaded_pipeline,
    location="Cyberjaya",
    property_type="Condominium",
    rooms=2,
    parking=0,
    bathroom=1,
    size=500,
    furnished='Partially Furnished',
)

    location property_type  rooms  parking  bathroom  size  \
0  Cyberjaya   Condominium      2        0         1   500   

             furnished  
0  Partially Furnished  


np.float64(1107.4572562160063)

In [13]:
# Load the whole JSON document from value_names.json; despite its name,
# `locations` holds the full parsed object, not only the location list.
with open('value_names.json', 'r') as file:
    locations = json.load(file)


In [21]:
# Display the object loaded from value_names.json.
# NOTE(review): the recorded output is not alphabetically sorted although the cell
# that writes value_names.json sorts each list — presumably this output predates
# that cell; re-run to confirm.
locations

{'locations': ['Taman Desa',
  'Cheras',
  'Sentul',
  'Mont Kiara',
  'Setapak',
  'Ampang',
  'Segambut',
  'Other',
  'Bukit Jalil',
  'Kepong',
  'KL City',
  'Wangsa Maju',
  'Solaris Dutamas',
  'Jalan Kuching',
  'Desa Pandan',
  'Old Klang Road',
  'Jalan Ipoh',
  'KLCC',
  'Gombak',
  'Pantai',
  'Sungai Besi',
  'Sri Petaling',
  'Bangsar South',
  'Bukit Bintang',
  'Kuchai Lama',
  'Titiwangsa',
  'Keramat',
  'OUG',
  'Pandan Perdana',
  'Sri Hartamas',
  'Desa Petaling',
  'Puchong',
  'Taman Melawati',
  'Ampang Hilir',
  'Sri Damansara',
  'Cyberjaya',
  'Shah Alam',
  'Petaling Jaya',
  'Klang',
  'Bandar Sunway',
  'Seri Kembangan',
  'Kajang',
  'Kota Damansara',
  'Batu Caves',
  'Semenyih',
  'Damansara Damai',
  'Bandar Mahkota Cheras',
  'Sepang',
  'Selayang',
  'Sungai Buloh',
  'Bangi',
  'Setia Alam',
  'Dengkil',
  'Subang Jaya',
  'Ara Damansara',
  'I-City',
  'Rawang',
  'Damansara Perdana',
  'Puncak Alam',
  '360',
  'Balakong'],
 'furnished': ['Fully F