In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
%matplotlib inline

In [2]:
# Load the daily temperature readings. The Date column is parsed into real
# datetimes at load time so that later index arithmetic and time-series
# tooling operate on timestamps rather than plain strings.
df = pd.read_csv("Weather_data.csv", parse_dates=["Date"])
print(df.shape)
df.head()

(114, 2)


Unnamed: 0,Date,Temperature
0,2017-01-01,15.913043
1,2017-01-02,18.5
2,2017-01-03,17.111111
3,2017-01-04,18.7
4,2017-01-05,18.388889


In [3]:
# Promote Date to the index. Guarded and rebound (no inplace mutation) so the
# cell can be re-run without raising KeyError once Date is already the index.
if 'Date' in df.columns:
    df = df.set_index('Date')

In [4]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0_level_0,Temperature
Date,Unnamed: 1_level_1
2017-01-01,15.913043
2017-01-02,18.5
2017-01-03,17.111111
2017-01-04,18.7
2017-01-05,18.388889


In [5]:
# Interactive line chart of the temperature readings against the frame's index.
fig = go.Figure(
    data=[
        go.Scatter(
            x=df.index,
            y=df.to_numpy().ravel(),  # flatten the frame's values to 1-D for plotting
            mode='lines',
            line={'color': 'deepskyblue', 'width': 3},
        )
    ]
)

# Light grid on both axes; the x axis additionally gets a range slider.
fig.update_xaxes(rangeslider_visible=True, showgrid=True, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridcolor='lightgray')

fig.update_layout(
    title='Temperature over the time',
    title_font_size=24,
    xaxis_title='Date',
    yaxis_title='Temperature',
    height=700,
    plot_bgcolor='#1f1f2e',
    paper_bgcolor='#dceeff',
    hovermode='x unified',
    font={'family': 'Arial', 'size': 14},
    margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
)
fig.show()

In [6]:
def check_stationarity(data):
  """Run the Augmented Dickey-Fuller test and summarise the outcome.

  Args:
    data: Series-like input handed straight to ``adfuller``.

  Returns:
    A ``(p_value, conclusion)`` tuple, where ``conclusion`` is a sentence
    stating whether H0 is rejected at the 0.05 level.
  """
  # adfuller returns a tuple whose second element is the p-value.
  p_value = adfuller(data)[1]
  conclusion = (
    "Fail to reject H0, that means that the given data is not stationary"
    if p_value > 0.05
    else "Reject H0, that means that the given data is stationary"
  )
  return p_value, conclusion

In [7]:
# ADF test on the undifferenced data; keep the p-value and show the verdict.
result = check_stationarity(data=df)
p_value = result[0]
print(result[-1])

Fail to reject H0, that means that the given data is not stationary


In [8]:
def difference_order(data, max_d=5):
  """Find the smallest differencing order that makes ``data`` stationary.

  The series is differenced repeatedly until the ADF p-value reported by
  ``check_stationarity`` is no longer above 0.05.

  Args:
    data: pandas Series/DataFrame to test; it is not modified in place.
    max_d: Upper bound on the number of differencing passes, guarding
      against an endless loop on data that never tests as stationary.

  Returns:
    The number of differences applied (0 if ``data`` already tests as
    stationary).

  Raises:
    ValueError: If the data is still non-stationary after ``max_d`` passes.
  """
  d = 0
  p_value = check_stationarity(data)[0]
  # The original loop contained a bare `p_value > 0.05` expression that had no
  # effect, so it always differenced at least once. Test the condition for
  # real so an already-stationary series yields d == 0.
  while p_value > 0.05:
    if d >= max_d:
      raise ValueError(
        f"Data is still non-stationary after {max_d} differencing passes"
      )
    d += 1
    data = data.diff().dropna()
    p_value = check_stationarity(data)[0]
  return d

In [9]:
# Differencing order reported by difference_order for the loaded frame.
d = difference_order(data=df)
print(d)

1


In [10]:
# Apply exactly the differencing order computed above (d) rather than a
# hard-coded single pass, so df_diff stays consistent with d.
df_diff = df.copy()
for diff_pass in range(d):
    df_diff = df_diff.diff().dropna()

In [11]:
# Autocorrelation and partial autocorrelation of the differenced temperature
# column, computed with nlags=20.
temp = df_diff.loc[:, 'Temperature']
pacf_vals = pacf(temp, nlags=20)
acf_vals = acf(temp, nlags=20)

# One lag label per ACF value, starting at lag 0.
lags = [lag for lag in range(len(acf_vals))]

In [12]:
# Bar chart of the autocorrelation values, one bar per lag.
fig = go.Figure(
    data=[go.Bar(x=lags, y=acf_vals, name='ACF', marker_color='lightblue')]
)

fig.update_layout(
    showlegend=False,
    plot_bgcolor='#1f1f2e',
    width=600,
    height=500,
    yaxis_title='Correlation',
    xaxis_title='Lag',
    title='Autocorrelation Function (ACF)',
)

fig.show()


In [13]:
# Bar chart of the partial autocorrelation values, one bar per lag.
fig = go.Figure()

fig.add_trace(go.Bar(x=lags, y=pacf_vals, name='PACF', marker_color='lightblue'))

fig.update_layout(
    # The title previously read "(ACF)", mislabelling this PACF chart.
    title='Partial Autocorrelation Function (PACF)',
    xaxis_title='Lag',
    yaxis_title='Correlation',
    height=500,
    width = 600,
    plot_bgcolor='#1f1f2e',
    showlegend=False
)

fig.show()

In [14]:
# Line chart of the differenced data against its index.
fig2 = go.Figure(
    data=[
        go.Scatter(
            x=df_diff.index,
            y=df_diff.to_numpy().ravel(),  # flatten the frame's values to 1-D
            mode='lines',
            line={'color': 'seagreen', 'width': 3},
        )
    ]
)

# Same grid styling on both axes; range slider on x only.
fig2.update_xaxes(rangeslider_visible=True, showgrid=True, gridcolor='lightgray')
fig2.update_yaxes(showgrid=True, gridcolor='lightgray')

fig2.update_layout(
    height=700,
    width=800,
    plot_bgcolor='#1f1f2e',
    paper_bgcolor='lightblue',
    margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
)
fig2.show()


# Tuning

In [15]:
# Work with the Temperature column as a Series for modelling.
series = df.loc[:, 'Temperature']

In [16]:
# Positional 80/20 split: the first 80% of rows become the training set and
# the remaining rows the test set (order preserved, no shuffling).
split_idx = int(0.8 * len(series))
train = series.iloc[:split_idx]
test = series.iloc[split_idx:]


In [17]:
import optuna
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, r2_score


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [95]:
def create_arima_model(p, d, q, df=None):
    """Build an (unfitted) ARIMA model of order ``(p, d, q)``.

    Args:
        p: Autoregressive order.
        d: Differencing order.
        q: Moving-average order.
        df: Data to model. When omitted, the notebook-level ``train`` series
            is looked up at call time. The previous ``df=train`` default was
            bound once when the function was defined, so it silently kept a
            stale series if ``train`` was rebuilt afterwards.

    Returns:
        The ``ARIMA`` model object; call ``.fit()`` on it to estimate.
    """
    if df is None:
        df = train
    return ARIMA(df, order=(p, d, q))

In [96]:
def objective(trial):
    """Optuna objective: MSE of an ARIMA(p, d, q) forecast over ``test``.

    Samples ``p`` in [0, 20], ``d`` in [0, 10] and ``q`` in [0, 20] from the
    trial, fits the model on ``train``, forecasts ``len(test)`` steps and
    scores the forecast against ``test``.

    Returns:
        The mean squared error, or ``float('inf')`` when anything in the
        fit/forecast/score path raises.
    """
    # Sampled in the fixed order p, d, q.
    p, d, q = (
        trial.suggest_int(name, 0, upper)
        for name, upper in (('p', 20), ('d', 10), ('q', 20))
    )

    try:
        fitted = create_arima_model(p, d, q, train).fit()
        predicted = fitted.forecast(steps=len(test))
        return mean_squared_error(test, predicted)
    except Exception as e:
        # Report the failing combination and give the trial the worst score.
        print(f"Trial failed with parameters p={p}, d={d}, q={q} because of {e}")
        return float('inf')


In [97]:
# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2025-06-03 18:08:37,861] A new study created in memory with name: no-name-0bd47d65-202d-41e8-87a6-86daea55e96a

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals

[I 2025-06-03 18:08:38,331] Trial 0 finished with value: 30.86184408421276 and parameters: {'p': 12, 'd': 0, 'q': 10}. Best is trial 0 with value: 30.86184408421276.

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be use

In [98]:
# Best parameters
print("Best ARIMA order:", study.best_params)
print("Best MSE is ", study.best_value)

Best ARIMA order: {'p': 9, 'd': 1, 'q': 17}
Best MSE is  2.3178754171269706


# Model Building

In [111]:
# Fit the ARIMA model with the best (p, d, q) order found by the Optuna study.
# The order is read from the study rather than hard-coded so this cell cannot
# drift out of sync with the tuning result. study.best_params has exactly the
# keys 'p', 'd' and 'q', matching create_arima_model's parameters.
model = create_arima_model(**study.best_params)
model_fit = model.fit()

# Print the model summary
print(model_fit.summary())


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.



                               SARIMAX Results                                
Dep. Variable:            Temperature   No. Observations:                   91
Model:                ARIMA(9, 1, 17)   Log Likelihood                -170.885
Date:                Tue, 03 Jun 2025   AIC                            395.771
Time:                        18:10:44   BIC                            463.265
Sample:                    01-01-2017   HQIC                           422.988
                         - 04-01-2017                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2405     28.945     -0.008      0.993     -56.972      56.491
ar.L2          0.3037     13.268      0.023      0.982     -25.701      26.308
ar.L3          0.1801      5.711      0.032      0.9


Maximum Likelihood optimization failed to converge. Check mle_retvals



In [112]:
# Forecast one step per test observation and score the forecast with MSE.
forecast = model_fit.forecast(steps=len(test))
error = mean_squared_error(y_true=test, y_pred=forecast)

In [113]:
# R^2 of the forecast against the test observations.
r2_score(y_true=test, y_pred=forecast)

0.5895094111480824

In [114]:
# Display the mean squared error computed for the test forecast.
error

2.3178754171269706

In [123]:
# Exploratory check of the horizon bounds: one past the row count, then 20
# further on. Note that `start` is reassigned in the following cell.
start = 1 + len(df)
20 + start

135

In [136]:
# Make predictions
start = len(df) 
end = start + 20
predictions = model_fit.predict(start=start, end=end)


In [137]:
# df.index may hold plain date strings (the CSV is read without date parsing),
# so normalise the last label to a Timestamp. This keeps any later date
# arithmetic on last_date (e.g. adding a Timedelta) from raising TypeError.
last_date = pd.Timestamp(df.index[-1])

# One daily label per prediction, starting the day after the last observation.
forecast_index = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=len(predictions),
    freq='D',
)


In [138]:
if len(forecast_index) != len(predictions):
    # Fallback: rebuild the index with exactly one daily label per prediction,
    # starting the day after the last observation (daily frequency assumed).
    # last_date is wrapped in pd.Timestamp because it may be a plain string
    # index label, and `str + Timedelta` raises TypeError.
    forecast_index = pd.date_range(
        start=pd.Timestamp(last_date) + pd.Timedelta(days=1),
        periods=len(predictions),
        freq='D',
    )

predictions.index = forecast_index


In [139]:
# Overlay the observed data (green) with the predicted values (red).
fig3 = go.Figure(
    data=[
        go.Scatter(
            x=df.index,
            y=df.to_numpy().ravel(),  # flatten the frame's values to 1-D
            mode='lines',
            line={'color': 'seagreen', 'width': 3},
        ),
        go.Scatter(
            x=predictions.index,
            y=predictions.to_numpy().ravel(),
            mode='lines',
            line={'color': 'red', 'width': 3},
        ),
    ]
)

# Same grid styling on both axes; range slider on x only.
fig3.update_xaxes(rangeslider_visible=True, showgrid=True, gridcolor='lightgray')
fig3.update_yaxes(showgrid=True, gridcolor='lightgray')

fig3.update_layout(
    title='Prediction of later 20 days Temperature',
    title_font_size=24,
    xaxis_title='Date',
    yaxis_title='Temperature',
    height=700,
    width=800,
    plot_bgcolor='#1f1f2e',
    paper_bgcolor='lightblue',
    margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
)
fig3.show()
