In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

import matplotlib.pyplot as plt



In [3]:
url = "https://raw.githubusercontent.com/ahnaf0806/Data-Analyst/refs/heads/main/Semester%203/Dataset/rumah-yogya.csv"
data = pd.read_csv(url)



In [4]:
# ======= INFROMASI DATASET ======
print(data.info())

print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             2020 non-null   object 
 1   nav-link          2020 non-null   object 
 2   description       2020 non-null   object 
 3   listing-location  2020 non-null   object 
 4   bed               2001 non-null   float64
 5   bath              1999 non-null   float64
 6   carport           1713 non-null   float64
 7   surface_area      2019 non-null   object 
 8   building_area     2019 non-null   object 
dtypes: float64(3), object(6)
memory usage: 142.2+ KB
None
            price                                           nav-link  \
0  Rp 1,79 Miliar  https://www.rumah123.com/properti/sleman/hos17...   
1     Rp 170 Juta  https://www.rumah123.com/properti/sleman/hos17...   
2     Rp 695 Juta  https://www.rumah123.com/properti/sleman/hos17...   
3     Rp 560 Juta  https://www

In [5]:
# ====== MERUBAH DATA TYPE MENJADI NUMERIK ======
def convert_area(x):
    if pd.isna(x): return np.nan
    s = str(x).lower().replace("m²","").replace("m2","").strip()
    s = s.replace(".", "").replace(",", ".")
    try:
        return float(s)
    except:
        return np.nan

data["surface_area"]  = data["surface_area"].apply(convert_area)
data["building_area"] = data["building_area"].apply(convert_area)


# ====== CLEAN PRICE ======
def clean_price(x):
    if pd.isna(x):
        return np.nan
    
    s = str(x).lower().replace("rp", "").strip()

    num_match = re.search(r"[\d.,]+", s)
    if not num_match:
        return np.nan

    num_str = num_match.group().replace(".", "").replace(",", ".")
    val = float(num_str)

    if "miliar" in s:
        val *= 1_000_000_000
    elif "juta" in s:
        val *= 1_000_000
    else:
        val *= 1_000_000 

    return val

data["price"] = data["price"].apply(clean_price)
data["price"] = pd.to_numeric(data["price"], errors="coerce")

# ====== DROP PRICE NaN DULU (WAJIB SEBELUM ASTYPE) ======
data = data.dropna(subset=["price"])

# ====== BARU BOLEH JADI INT ======
data["price"] = data["price"].round(0).astype("int64")

print(data["price"].head())


0    1790000000
1     170000000
2     695000000
3     560000000
4     200000000
Name: price, dtype: int64


In [6]:

print("\n INFOMARSI DATA SETELAH CLEANING")
print(data.info())


 INFOMARSI DATA SETELAH CLEANING
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             2020 non-null   int64  
 1   nav-link          2020 non-null   object 
 2   description       2020 non-null   object 
 3   listing-location  2020 non-null   object 
 4   bed               2001 non-null   float64
 5   bath              1999 non-null   float64
 6   carport           1713 non-null   float64
 7   surface_area      2019 non-null   float64
 8   building_area     2019 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 142.2+ KB
None


In [7]:
# =========================================
# MEMBERSIHKAN DATA YANG HILANG
# =========================================
data = data.dropna(subset=['price'])

prediktor = ["bed", "bath", "carport", "surface_area", "building_area"]

# CEK RATA RATA DATA PREDIKTOR
mean_prediktor = data[prediktor].mean()
print("\n RATA RATA DATA PREDIKTOR")
print(mean_prediktor)

# ISI DATA YANG HILANG DENGAN RATA RATA
data[prediktor] = data[prediktor].fillna(mean_prediktor)

print("\n INFORMASI DATA SETELAH MENGISI DATA YANG HILANG")
print(data.info())

print(f"\n{data.isnull().sum()}")


 RATA RATA DATA PREDIKTOR
bed              3.969015e+00
bath             2.941971e+00
carport          1.559253e+00
surface_area     1.857112e+02
building_area    1.192893e+06
dtype: float64

 INFORMASI DATA SETELAH MENGISI DATA YANG HILANG
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             2020 non-null   int64  
 1   nav-link          2020 non-null   object 
 2   description       2020 non-null   object 
 3   listing-location  2020 non-null   object 
 4   bed               2020 non-null   float64
 5   bath              2020 non-null   float64
 6   carport           2020 non-null   float64
 7   surface_area      2020 non-null   float64
 8   building_area     2020 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 142.2+ KB
None

price               0
nav-link            0
description         0

In [8]:
data = data[data["building_area"] < 2000]
Q1 = data["building_area"].quantile(0.25)
Q3 = data["building_area"].quantile(0.75)
IQR = Q3 - Q1
low, high = Q1 - 1.5*IQR, Q3 + 1.5*IQR

data = data[(data["building_area"] >= low) & (data["building_area"] <= high)]

In [9]:
# ===== MEMILIH DATA TEST DAN DATA TRAIN =====
fitur = ["bed", "bath", "carport", "surface_area", "building_area", "listing-location"]

x = data[prediktor]
y = data['price']


In [10]:
# ===== MEMBAGI DATASET MENJADI DATA TEST DAN TRAIN ====
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"Bentuk dari X_train: {x_train.shape}")
print(f"Bentuk dari X_test: {x_test.shape}")
print(f"Bentuk dari y_train: {y_train.shape}")
print(f"Bentuk dari y_test: {y_test.shape}")

Bentuk dari X_train: (1481, 5)
Bentuk dari X_test: (371, 5)
Bentuk dari y_train: (1481,)
Bentuk dari y_test: (371,)


In [11]:
# ===== MODEL =====
model = LinearRegression()
# ===== MELATIH MODEL MENGGUNAKNA DATA TRAIN =====
model.fit(x_train, y_train)
# ===== PREDIKSI MENGGUNAKAN DATA TEST =====
y_pred = model.predict(x_test)

In [12]:
# ===== MENGHITUNG METRIK EVALUASI =====
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Squared Error (mse): {mse}")
print(f"Root Mean Squared error(rmse): {rmse}")
print(f"R-Squared(R2): {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Squared Error (mse): 5.585041856396415e+17
Root Mean Squared error(rmse): 747331376.0572624
R-Squared(R2): 0.5565328947824872
Mean Absolute Percentage Error: 0.3467262540423527


In [13]:
print(data[["surface_area","building_area"]].describe())


       surface_area  building_area
count   1852.000000    1852.000000
mean     144.910907     109.656587
std      109.627108      64.078413
min        4.000000       4.000000
25%       93.000000      60.000000
50%      114.000000      91.000000
75%      151.000000     150.000000
max     1640.000000     302.000000
