<a href="https://colab.research.google.com/github/yogasgm/data-science-machine-learning-BI/blob/main/Introduction_to_Nowcasting_DS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [23]:
# Source CSV for the notebook, hosted in the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/yogasgm/data-science-machine-learning-BI/refs/heads/main/dataset/synthetic_nowcasting_data.csv"

# Read the whole file into a DataFrame and display it as the cell output.
data = pd.read_csv(DATA_URL)
data

Unnamed: 0,Date,GDP_Growth,Unemployment_Rate,Exchange_Rate,Interest_Rate,Money_Supply,Consumer_Confidence_Index,Inflation_Rate
0,2015-01-01,1.00,6.90,13000.00,4.06,98.41,106.83,2.10
1,2015-02-01,0.36,6.36,13000.00,4.50,100.19,95.16,2.79
2,2015-03-01,1.15,7.13,13164.60,3.65,101.85,98.61,2.13
3,2015-04-01,2.02,5.29,13179.05,4.56,102.99,118.97,2.50
4,2015-05-01,0.27,7.16,13000.00,4.05,99.28,101.70,2.27
...,...,...,...,...,...,...,...,...
115,2024-08-01,0.80,6.92,14940.65,8.35,146.31,105.43,6.51
116,2024-09-01,0.47,5.75,14888.79,9.08,148.70,113.81,7.16
117,2024-10-01,-0.67,7.43,14922.12,8.72,148.58,90.38,7.01
118,2024-11-01,1.64,5.85,14782.03,10.43,150.23,116.59,8.56


In [24]:
# Add lag features
def create_lag_features(df, lag=1):
    """Return a copy of ``df`` with lagged versions of every non-'Date' column.

    For each requested lag ``k`` and each column ``c`` other than ``'Date'``,
    a new column named ``f"{c}_lag{k}"`` is appended that holds ``c`` shifted
    down by ``k`` rows. The shift is positional (``Series.shift``), not
    calendar-aware, so the rows are assumed to already be in chronological
    order with a regular frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.
    lag : int or list/tuple/range of int, default 1
        Number of rows to shift by. Passing a list, tuple or range builds one
        set of lag columns per value (e.g. ``lag=[1, 3]`` adds ``*_lag1`` and
        ``*_lag3``). A single value behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the new lag columns. For a positive
        lag ``k`` the first ``k`` rows of each ``*_lag{k}`` column are NaN.
    """
    # Normalise the argument so the scalar and multi-lag cases share one loop.
    lags = list(lag) if isinstance(lag, (list, tuple, range)) else [lag]

    # Freeze the set of source columns up front so lag columns created in an
    # earlier pass are never themselves lagged in a later pass.
    source_cols = [col for col in df.columns if col != 'Date']

    df_copy = df.copy()
    for periods in lags:
        for col in source_cols:
            df_copy[col + f'_lag{periods}'] = df_copy[col].shift(periods)
    return df_copy

# Create lag features: every non-'Date' column of `data` gets a companion
# `<column>_lag1` column holding the previous row's value (NaN in the first row).
data_lagged = create_lag_features(data, lag=1)
data_lagged

Unnamed: 0,Date,GDP_Growth,Unemployment_Rate,Exchange_Rate,Interest_Rate,Money_Supply,Consumer_Confidence_Index,Inflation_Rate,GDP_Growth_lag1,Unemployment_Rate_lag1,Exchange_Rate_lag1,Interest_Rate_lag1,Money_Supply_lag1,Consumer_Confidence_Index_lag1,Inflation_Rate_lag1
0,2015-01-01,1.00,6.90,13000.00,4.06,98.41,106.83,2.10,,,,,,,
1,2015-02-01,0.36,6.36,13000.00,4.50,100.19,95.16,2.79,1.00,6.90,13000.00,4.06,98.41,106.83,2.10
2,2015-03-01,1.15,7.13,13164.60,3.65,101.85,98.61,2.13,0.36,6.36,13000.00,4.50,100.19,95.16,2.79
3,2015-04-01,2.02,5.29,13179.05,4.56,102.99,118.97,2.50,1.15,7.13,13164.60,3.65,101.85,98.61,2.13
4,2015-05-01,0.27,7.16,13000.00,4.05,99.28,101.70,2.27,2.02,5.29,13179.05,4.56,102.99,118.97,2.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2024-08-01,0.80,6.92,14940.65,8.35,146.31,105.43,6.51,0.31,7.92,14747.31,9.92,147.86,98.61,7.61
116,2024-09-01,0.47,5.75,14888.79,9.08,148.70,113.81,7.16,0.80,6.92,14940.65,8.35,146.31,105.43,6.51
117,2024-10-01,-0.67,7.43,14922.12,8.72,148.58,90.38,7.01,0.47,5.75,14888.79,9.08,148.70,113.81,7.16
118,2024-11-01,1.64,5.85,14782.03,10.43,150.23,116.59,8.56,-0.67,7.43,14922.12,8.72,148.58,90.38,7.01


In [25]:
# The lag shift leaves the first row without lagged values; discard every row
# that contains at least one NaN so the models only see complete observations.
data_lagged = data_lagged.dropna(axis=0, how='any')
data_lagged

Unnamed: 0,Date,GDP_Growth,Unemployment_Rate,Exchange_Rate,Interest_Rate,Money_Supply,Consumer_Confidence_Index,Inflation_Rate,GDP_Growth_lag1,Unemployment_Rate_lag1,Exchange_Rate_lag1,Interest_Rate_lag1,Money_Supply_lag1,Consumer_Confidence_Index_lag1,Inflation_Rate_lag1
1,2015-02-01,0.36,6.36,13000.00,4.50,100.19,95.16,2.79,1.00,6.90,13000.00,4.06,98.41,106.83,2.10
2,2015-03-01,1.15,7.13,13164.60,3.65,101.85,98.61,2.13,0.36,6.36,13000.00,4.50,100.19,95.16,2.79
3,2015-04-01,2.02,5.29,13179.05,4.56,102.99,118.97,2.50,1.15,7.13,13164.60,3.65,101.85,98.61,2.13
4,2015-05-01,0.27,7.16,13000.00,4.05,99.28,101.70,2.27,2.02,5.29,13179.05,4.56,102.99,118.97,2.50
5,2015-06-01,0.27,7.96,13257.68,3.52,101.43,97.52,1.94,0.27,7.16,13000.00,4.05,99.28,101.70,2.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2024-08-01,0.80,6.92,14940.65,8.35,146.31,105.43,6.51,0.31,7.92,14747.31,9.92,147.86,98.61,7.61
116,2024-09-01,0.47,5.75,14888.79,9.08,148.70,113.81,7.16,0.80,6.92,14940.65,8.35,146.31,105.43,6.51
117,2024-10-01,-0.67,7.43,14922.12,8.72,148.58,90.38,7.01,0.47,5.75,14888.79,9.08,148.70,113.81,7.16
118,2024-11-01,1.64,5.85,14782.03,10.43,150.23,116.59,8.56,-0.67,7.43,14922.12,8.72,148.58,90.38,7.01


In [26]:
# Predictors are the one-row lags of six macro indicators; the target is the
# same row's (unlagged) inflation rate. Inflation_Rate_lag1 is not used as a predictor.
lagged_feature_names = [
    'GDP_Growth_lag1',
    'Unemployment_Rate_lag1',
    'Exchange_Rate_lag1',
    'Interest_Rate_lag1',
    'Money_Supply_lag1',
    'Consumer_Confidence_Index_lag1',
]
X_lagged = data_lagged[lagged_feature_names]
y_lagged = data_lagged['Inflation_Rate']

# Per-fold scores, stored column-wise: one list entry per (model, fold) pair
results_corrected = {column: [] for column in ('Model', 'RMSE', 'R2', 'MAE')}

# Candidate regressors, keyed by the label shown in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Time-ordered cross-validation with 5 train/test splits over the rows of X_lagged
tscv = TimeSeriesSplit(n_splits=5)

# Score every model on every split
for model_name, model in models.items():
    for train_index, test_index in tscv.split(X_lagged):
        # Slice predictors and target with the positional indices of this split
        X_train, y_train = X_lagged.iloc[train_index], y_lagged.iloc[train_index]
        X_test, y_test = X_lagged.iloc[test_index], y_lagged.iloc[test_index]

        # Refit on the training rows of this split, then predict its test rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Out-of-sample error metrics for this split
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        # Append one value to each result column
        fold_scores = {'Model': model_name, 'RMSE': rmse, 'R2': r2, 'MAE': mae}
        for column, value in fold_scores.items():
            results_corrected[column].append(value)

# One row per (model, split) pair
results_corrected_df = pd.DataFrame(results_corrected)
results_corrected_df

Unnamed: 0,Model,RMSE,R2,MAE
0,Linear Regression,0.925949,-2.821121,0.821426
1,Linear Regression,0.682762,-0.101757,0.563965
2,Linear Regression,0.594314,0.024689,0.508533
3,Linear Regression,0.616933,-0.242167,0.49642
4,Linear Regression,0.627838,0.117371,0.448031
5,Random Forest,1.056323,-3.97291,0.901395
6,Random Forest,0.87015,-0.789518,0.710542
7,Random Forest,0.721304,-0.436641,0.593958
8,Random Forest,0.666596,-0.450205,0.551516
9,Random Forest,1.144653,-1.933795,0.902363


In [27]:
# Mean of each metric across the cross-validation splits, one row per model.
# as_index=False keeps 'Model' as a regular column instead of the index.
average_results = results_corrected_df.groupby('Model', as_index=False).mean()

# Show the averaged table
average_results

Unnamed: 0,Model,RMSE,R2,MAE
0,Linear Regression,0.689559,-0.604597,0.567675
1,Random Forest,0.891805,-1.516614,0.731955


In [28]:
# Refit every candidate on all rows of the lagged feature set.
# NOTE: `fit` returns the estimator itself, so this dictionary holds the very
# same objects as `models` (now fitted on the full data), not copies.
final_models_corrected = {
    model_name: model.fit(X_lagged, y_lagged)
    for model_name, model in models.items()
}

In [29]:
# Take the indicator values from the last row of `data` (the most recent observation)
latest_observation = data.iloc[-1]
december_2024_data = latest_observation[[
    'GDP_Growth', 'Unemployment_Rate', 'Exchange_Rate',
    'Interest_Rate', 'Money_Supply', 'Consumer_Confidence_Index',
]]

# Wrap the values in a single-row DataFrame labelled with the training feature names.
# The mapping is positional, so the selection above must list the indicators in the
# same order as X_lagged.columns.
december_2024_data_lagged = pd.DataFrame(
    [december_2024_data.values],
    columns=X_lagged.columns,
)
december_2024_data_lagged

Unnamed: 0,GDP_Growth_lag1,Unemployment_Rate_lag1,Exchange_Rate_lag1,Interest_Rate_lag1,Money_Supply_lag1,Consumer_Confidence_Index_lag1
0,1.25,6.8,14810.83,9.51,148.35,105.79


In [30]:
# Point prediction from each final model for the single feature row built from the
# last observation in `data`. The models map one-row-lagged indicators to the current
# inflation rate, so the result is the estimate for the period right after that row,
# not for August 2024; the variable name is historical and kept unchanged so existing
# references keep working.
august_predictions_corrected = {}
for fitted_name, fitted_model in final_models_corrected.items():
    point_prediction = fitted_model.predict(december_2024_data_lagged)
    august_predictions_corrected[fitted_name] = point_prediction[0]
august_predictions_corrected

{'Linear Regression': 6.922872648159724, 'Random Forest': 7.336799999999995}