<a href="https://colab.research.google.com/github/abidrozhan/MidTerm-Deep-Learning/blob/main/AbidRozhan_Midterm_Regression_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Instalasi dependensi dan pengunduhan dataset
!pip install -q gdown scikit-learn tensorflow

In [2]:
import pandas as pd

# Membaca file CSV, melewati baris yang rusak
raw_df = pd.read_csv('/content/midterm-regresi-dataset.csv', header=None, on_bad_lines='skip')

# Kolom pertama adalah target, sisanya fitur
raw_df.rename(columns={0: 'year'}, inplace=True)
feature_cols = [f'feature_{i}' for i in range(raw_df.shape[1] - 1)]
for idx, col in enumerate(feature_cols):
    raw_df.rename(columns={idx+1: col}, inplace=True)

raw_df.head()

Unnamed: 0,year,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_80,feature_81,feature_82,feature_83,feature_84,feature_85,feature_86,feature_87,feature_88,feature_89
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [3]:
# Cell 3: Preprocessing – Imputasi nilai hilang dan scaling
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

# Pisahkan fitur dan target
X = raw_df[feature_cols]
y = raw_df['year']

# Convert all feature columns to numeric, coercing errors to NaN
for col in feature_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Preprocessor: imputasi median dan standardisasi
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), feature_cols)
    ]
)

# Split data menjadi train dan test
# (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Terapkan preprocessing ke training dan test
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

# Ubah target ke array numpy
y_train = y_train.values
y_test = y_test.values

print('Preprocessing selesai. Bentuk data latih:', X_train_prepared.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.to_numeric(X[col], errors='coerce')


Preprocessing selesai. Bentuk data latih: (412276, 90)


In [4]:
# Cell 4: Membangun dan melatih model jaringan saraf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Definisikan arsitektur model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_prepared.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(1)  # Output layer (regresi)
])

# Kompilasi model
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mse']
)

# Callback untuk early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Melatih model
history = model.fit(
    X_train_prepared, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5154/5154 - 19s - 4ms/step - loss: 248351.7031 - mae: 310.7319 - mse: 248351.7031 - val_loss: 5604.5190 - val_mae: 48.5230 - val_mse: 5604.5190
Epoch 2/100
5154/5154 - 16s - 3ms/step - loss: 25279.4160 - mae: 125.7791 - mse: 25279.4160 - val_loss: 1282.7109 - val_mae: 24.7331 - val_mse: 1282.7109
Epoch 3/100
5154/5154 - 16s - 3ms/step - loss: 22251.5410 - mae: 118.7861 - mse: 22251.5410 - val_loss: 1133.0833 - val_mae: 24.3972 - val_mse: 1133.0833
Epoch 4/100
5154/5154 - 16s - 3ms/step - loss: 21683.5840 - mae: 117.4254 - mse: 21683.5840 - val_loss: 743.4722 - val_mae: 20.2123 - val_mse: 743.4722
Epoch 5/100
5154/5154 - 16s - 3ms/step - loss: 21167.5684 - mae: 116.1272 - mse: 21167.5684 - val_loss: 763.9169 - val_mae: 20.2673 - val_mse: 763.9169
Epoch 6/100
5154/5154 - 15s - 3ms/step - loss: 20912.0742 - mae: 115.3053 - mse: 20912.0742 - val_loss: 1127.7256 - val_mae: 27.8330 - val_mse: 1127.7256
Epoch 7/100
5154/5154 - 16s - 3ms/step - loss: 20599.8496 - mae: 114.4253 - ms

In [5]:
# Cell 5: Evaluasi model pada data uji
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Prediksi pada data test
y_pred = model.predict(X_test_prepared).flatten()

# Hitung metrik evaluasi
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Deep Learning Model Performance:')
print('MSE :', mse)
print('RMSE:', rmse)
print('MAE :', mae)
print('R²  :', r2)

[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
Deep Learning Model Performance:
MSE : 161.64715576171875
RMSE: 12.714053474864684
MAE : 9.697113990783691
R²  : -0.35820817947387695
