In [None]:
# Download GDrive files using python
!pip install -q gdown

# Download datasets from GDrive folder and create 'midterm_folder'
!gdown 1kbO_PMxqTOX5AOBgGKuv2Xzxeh3J-zmt -O midterm_folder

Downloading...
From (original): https://drive.google.com/uc?id=1kbO_PMxqTOX5AOBgGKuv2Xzxeh3J-zmt
From (redirected): https://drive.google.com/uc?id=1kbO_PMxqTOX5AOBgGKuv2Xzxeh3J-zmt&confirm=t&uuid=b3995940-8b2f-4abd-8e47-0c4d9ef91c86
To: /content/midterm_folder
100% 443M/443M [00:05<00:00, 75.2MB/s]


In [None]:
import pandas as pd
import os

file_path = '/content/midterm_folder'
df = pd.read_csv(file_path)
file_size_bytes = os.path.getsize(file_path)
file_size_mb = file_size_bytes / (1024 ** 2)

print("df.shape:", df.shape) # 515344 row and 90 feature + 1 target
print(f"Ukuran file: {file_size_mb:.2f} MB")

df.head() # Header non-existent, first column is not a feature

df.shape: (515344, 91)
Ukuran file: 422.88 MB


Unnamed: 0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
0,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
1,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
2,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
3,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903
4,2001,50.54767,0.31568,92.35066,22.38696,-25.5187,-19.04928,20.67345,-5.19943,3.63566,...,6.59753,-50.69577,26.02574,18.9443,-0.3373,6.09352,35.18381,5.00283,-11.02257,0.02263


In [None]:
# Split the original data for test file

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("test_df.shape:", test_df.shape)
test_df.head()

test_df.shape: (103069, 91)


Unnamed: 0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
201297,2008,44.09533,3.25213,6.4771,17.69897,-23.67294,-21.82566,15.18481,-0.77401,9.02418,...,4.48519,-119.21003,-233.02599,78.49102,13.40812,-296.73533,-104.05398,16.81235,-224.4607,-21.68576
75576,1993,38.90385,-8.67212,31.33171,-7.999,0.46314,-2.96364,-27.22097,-1.84019,11.14816,...,20.8432,-198.2405,270.72953,143.22468,0.53682,174.226,415.35818,5.54063,42.25141,27.75021
46834,2006,49.15728,27.23024,31.62364,1.06704,-22.8477,-11.96281,11.30337,5.49682,7.02751,...,4.38046,-32.79098,-12.28873,14.90571,-3.15337,43.95009,-90.2681,4.32582,19.30317,-16.91637
481423,1992,36.45757,-57.98751,55.2172,6.74121,-38.01408,-8.91355,13.01662,1.44185,2.08769,...,15.49053,-276.88177,196.96765,54.24117,3.65038,-68.31565,122.28213,7.70799,89.50155,11.27349
90320,1998,42.94,20.84997,28.13751,-3.64397,16.60444,-0.39047,-23.82962,-1.82006,-3.71131,...,-5.14548,-67.79813,95.24916,72.04623,-7.48565,131.23753,-91.93314,5.46681,-14.33837,-8.40538




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separating target and defining XYs
target_col = df.columns[1]
X = df.drop(columns=[target_col])
y = df[target_col]

# Fill missing values
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
categoric_cols = X.select_dtypes(include=['object']).columns
X[categoric_cols] = X[categoric_cols].fillna("missing")

print("Numeric features:", len(numeric_cols))
print("Categoric features:", len(categoric_cols))

# MinMax Scaler and One Hot Encoder
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categoric_cols)
    ]
)

# Identifying outliers with IQR (less aggressive)
print("\nData Before IQR:", len(df))

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 2.5 * IQR
    upper_bound = Q3 + 2.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print("Data After IQR:", len(df))

# Split with Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed  = preprocessor.transform(X_test)

print("\nShape After Train:", X_train_processed.shape)
print("Shape After Test:", X_test_processed.shape)

Numeric features: 90
Categoric features: 0
Data Before IQR: 515344
Data After IQR: 251955

Shape After Train: (412275, 90)
Shape After Test: (103069, 90)


In [None]:
# Model: Linear Regression (no real hyperparameters)
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_processed, y_train)

print("Model training completed.")

Model training completed.


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_test_processed)

mse  = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print("MSE :", mse)
print("RMSE:", rmse)
print("MAE :", mae)
print("R²  :", r2)

MSE : 10.620849315455532
RMSE: 3.2589644544633396
MAE : 2.432033286398339
R²  : 0.7130097106806829
