In [None]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv("../preprocessed-data.csv")
df

Unnamed: 0,Date,Value
0,2010-01-01,388.91
1,2010-02-01,390.41
2,2010-03-01,391.37
3,2010-04-01,392.67
4,2010-05-01,393.21
...,...,...
179,2024-12-01,425.40
180,2025-01-01,426.65
181,2025-02-01,427.09
182,2025-03-01,428.15


In [10]:
def evaluateModel(df, predCol):
    # remove rows with NaN predictions for evaluation
    validDf = df.dropna(subset=[predCol, 'Value'])
    
    if len(validDf) == 0:
        return "No valid predictions to evaluate"
    
    actual = validDf['Value']
    predicted = validDf[predCol]
    
    # calculate metrics
    mae = np.mean(np.abs(actual - predicted))
    mse = np.mean((actual - predicted) ** 2)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    
    return f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}%"

In [11]:
def execLinearRegression(steps=12):
    dfCopy = df.copy()
    dfCopy['Date'] = pd.to_datetime(dfCopy['Date'])
    
    # create time index (months since start)
    startDate = dfCopy['Date'].min()
    dfCopy['timeIndex'] = (dfCopy['Date'] - startDate).dt.days / 30.44  # approximate months
    
    x = dfCopy['timeIndex'].values.reshape(-1, 1)
    y = dfCopy['Value'].values
    
    # fit linear regression model
    model = LinearRegression()
    model.fit(x, y)
    
    print(f"Linear Regression Model:")
    print(f"R² Score: {model.score(x, y):.6f}")
    print(f"Coefficient: {model.coef_[0]:.6f}")
    print(f"Intercept: {model.intercept_:.6f}")
    
    # make in-sample predictions
    inSamplePred = model.predict(x)
    
    resultDf = dfCopy.copy()
    resultDf['linearRegPred'] = inSamplePred
    
    # generate future predictions
    lastTimeIndex = dfCopy['timeIndex'].iloc[-1]
    lastDate = dfCopy['Date'].iloc[-1]
    
    futureDates = []
    futureValues = []
    futureLinearReg = []
    
    for i in range(steps):
        newDate = lastDate + relativedelta(months=i+1)
        newTimeIndex = lastTimeIndex + (i + 1)
        
        # predict future value
        futurePred = model.predict([[newTimeIndex]])[0]
        
        futureDates.append(newDate)
        futureValues.append(np.nan)
        futureLinearReg.append(futurePred)
    
    # create future dataframe
    futureDf = pd.DataFrame({
        'Date': futureDates,
        'Value': futureValues,
        'timeIndex': [lastTimeIndex + i + 1 for i in range(steps)],
        'linearRegPred': futureLinearReg
    })
    
    extendedDf = pd.concat([resultDf, futureDf], ignore_index=True)
    
    print("\nModel Evaluation:")
    evaluation = evaluateModel(resultDf, "linearRegPred")
    print(evaluation, "\n")
    
    # format output similar to sarima
    extendedDf['Date'] = extendedDf['Date'].dt.strftime('%Y-%m-%d')
    outputDf = extendedDf[['Date', 'Value', 'linearRegPred']].copy()
    
    print(outputDf.to_string())
    return outputDf

In [12]:
linearRegResults = execLinearRegression(12)

outPath = "../linear-reg-results.csv"
linearRegResults.to_csv(outPath, index=False)

linearRegResults

Linear Regression Model:
R² Score: 0.958588
Coefficient: 0.205238
Intercept: 388.286592

Model Evaluation:
MAE: 1.9399, MSE: 5.1333, RMSE: 2.2657, MAPE: 0.4772% 

           Date   Value  linearRegPred
0    2010-01-01  388.91     388.286592
1    2010-02-01  390.41     388.495606
2    2010-03-01  391.37     388.684392
3    2010-04-01  392.67     388.893406
4    2010-05-01  393.21     389.095677
5    2010-06-01  392.38     389.304691
6    2010-07-01  390.41     389.506962
7    2010-08-01  388.54     389.715975
8    2010-09-01  387.03     389.924989
9    2010-10-01  387.43     390.127260
10   2010-11-01  388.87     390.336274
11   2010-12-01  389.99     390.538545
12   2011-01-01  391.50     390.747558
13   2011-02-01  392.05     390.956572
14   2011-03-01  392.80     391.145358
15   2011-04-01  393.44     391.354372
16   2011-05-01  394.41     391.556643
17   2011-06-01  393.95     391.765657
18   2011-07-01  392.72     391.967928
19   2011-08-01  390.33     392.176941
20   2011-09-01  3

Unnamed: 0,Date,Value,linearRegPred
0,2010-01-01,388.91,388.286592
1,2010-02-01,390.41,388.495606
2,2010-03-01,391.37,388.684392
3,2010-04-01,392.67,388.893406
4,2010-05-01,393.21,389.095677
...,...,...,...
191,2025-12-01,,427.476768
192,2026-01-01,,427.682006
193,2026-02-01,,427.887243
194,2026-03-01,,428.092481
