<a href="https://colab.research.google.com/github/apriandito/dkem/blob/main/Introduction_to_Nowcasting_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [None]:
# Location of the dummy macroeconomic dataset (a CSV file hosted on GitHub)
DATA_URL = "https://raw.githubusercontent.com/apriandito/dkem/main/data/dummy_macroeconomic_data.csv"

# Read the CSV into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer=DATA_URL)
data

In [None]:
# Inspect the final rows of the dataset (tail's default of 5 rows, spelled out)
data.tail(n=5)

In [None]:
# Add lag features
def create_lag_features(df, lag=1):
    """Return a copy of ``df`` with a lagged version of every non-'Date' column.

    For each column other than 'Date', a new column named
    ``<column>_lag<lag>`` is appended, holding that column's values shifted
    down by ``lag`` rows (the first ``lag`` rows of the new column are
    therefore missing). The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    lag : int, default 1
        Number of rows to shift by; passed straight to ``Series.shift``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the lagged columns added after the originals.
    """
    lagged = df.copy()
    suffix = f'_lag{lag}'

    # Column names are taken from the input frame, so the lag columns added
    # below are never themselves lagged again.
    source_columns = [name for name in df.columns if name != 'Date']

    for name in source_columns:
        lagged[name + suffix] = lagged[name].shift(lag)

    return lagged

# Append one-row lags of every column except 'Date', then display the result
data_lagged = create_lag_features(df=data, lag=1)
data_lagged

In [None]:
# Shifting leaves missing values in the lagged columns; keep only rows in
# which every column is populated
data_lagged = data_lagged.dropna(axis=0, how='any')
data_lagged

In [None]:
# Model inputs are the lagged predictors only; the target is the unlagged
# inflation rate
lagged_feature_columns = [
    'GDP_Growth_lag1',
    'Unemployment_Rate_lag1',
    'Exchange_Rate_lag1',
    'Interest_Rate_lag1',
    'Money_Supply_lag1',
    'Consumer_Confidence_Index_lag1',
]
X_lagged = data_lagged[lagged_feature_columns]
y_lagged = data_lagged['Inflation_Rate']

# One list per reported column; every (model, fold) pair appends one entry
# to each list
metric_columns = ('Model', 'RMSE', 'R2', 'MAE')
results_corrected = {column: [] for column in metric_columns}

# Candidate models to compare
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Evaluate every model on every time-series fold
for model_name, model in models.items():
    for train_index, test_index in tscv.split(X_lagged):
        # Positional slices of features and target for this fold
        X_train = X_lagged.iloc[train_index]
        X_test = X_lagged.iloc[test_index]
        y_train = y_lagged.iloc[train_index]
        y_test = y_lagged.iloc[test_index]

        # Fit on the training rows, then predict the held-out rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Score the fold (RMSE is the square root of the mean squared error)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        # Record the fold's scores, one value per results column
        fold_scores = (model_name, rmse, r2, mae)
        for column, value in zip(metric_columns, fold_scores):
            results_corrected[column].append(value)

# Convert results to DataFrame
results_corrected_df = pd.DataFrame(results_corrected)
results_corrected_df

In [None]:
# Mean of each evaluation metric per model, with 'Model' kept as a regular
# column rather than as the index
average_results = results_corrected_df.groupby('Model', as_index=False).mean()

# Display the averaged metrics
average_results

In [None]:
# Refit every model on the entire dataset using only the lagged features.
# The new dictionary maps the same names to the same estimator objects held
# in `models`, so those objects are the ones being refit.
final_models_corrected = dict(models)
for estimator in final_models_corrected.values():
    estimator.fit(X_lagged, y_lagged)

In [None]:
# Raw (unlagged) column names matching the model's features, derived from
# X_lagged.columns so that the feature list and its order live in one place.
# Previously a second hand-written list was relabelled purely by position,
# which would silently feed wrong values to the model if either list changed.
LAG_SUFFIX = '_lag1'
assert all(name.endswith(LAG_SUFFIX) for name in X_lagged.columns), \
    "every model feature is expected to be a lag-1 column"
base_feature_names = [name[:-len(LAG_SUFFIX)] for name in X_lagged.columns]

# Latest observation of those raw features. This is simply the last row of
# `data`; presumably July 2024 - TODO confirm the file is sorted by date.
july_2024_data = data.iloc[-1][base_feature_names]

# Convert the July 2024 data to the format expected by the model (single row
# DataFrame). Columns are selected before the row is taken, so each column
# keeps the dtype it has in `data`, and renaming is done by name via the
# suffix rather than by position. The index is reset so the row is labelled 0.
july_2024_data_lagged = (
    data[base_feature_names]
    .iloc[[-1]]
    .add_suffix(LAG_SUFFIX)
    .reset_index(drop=True)
)
july_2024_data_lagged

In [None]:
# Use the trained models to predict the inflation rate for August 2024
august_predictions_corrected = {}
for model_name, model in final_models_corrected.items():
    # Keep only the first element of the array returned by predict
    august_predictions_corrected[model_name] = model.predict(july_2024_data_lagged)[0]
august_predictions_corrected