In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf as _acf, pacf as _pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typing import Union, List, Dict, Any

In [None]:
# Load the hourly summary; 'device_time' is parsed into datetimes on read.
df = pd.read_csv(
    'hourly_summary.csv',
    parse_dates=['device_time'],
)

In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for the numeric columns (this cell previously repeated
# the df.head() preview shown in the cell above).
df.describe()

In [None]:
# Pairwise correlations between the numeric columns, before any lag features are added.
df.corr(numeric_only=True)

# Computing Lag (Previous-Hour) Features

In [None]:
# Lagged copies of the signals: each shift(k) moves values down by k rows,
# so the first k rows of the new column are NaN.
# NOTE(review): this only means "k hours ago" if rows are sorted by time with
# no missing hours — confirm against the source data.
for lag in (1, 2, 3):
    df[f"humidity_lag{lag}"] = df["humidity"].shift(lag)
df["humidity_rate_lag1"] = df["Humidity_rate"].shift(1)
df["temp_lag1"] = df["temperature"].shift(1)

In [None]:
# Correlations again, now including the lag columns added above.
df.corr(numeric_only=True)

In [None]:
def check_acf_pacf(
    df: pd.DataFrame,
    cols: Union[str, List[str]],
    nlags: int = 40,
    alpha: float = 0.05,
    dropna_strategy: str = "drop",   # options: "drop", "ffill", "bfill", "interpolate"
    pacf_method: str = "ywm", # options vary by statsmodels version: "ywunbiased","ywm","ols","ld"
    plot: bool = True,
    figsize: tuple = (12, 6),
    title_prefix: str = "",
    return_data: bool = True,
    acf_fft: bool = False,
    **plot_kwargs
) -> Dict[str, Any]:
    """
    Compute and optionally plot ACF and PACF for one or more series.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing time series columns.
    cols : str or list[str]
        Column name or list of column names to analyse.
    nlags : int
        Number of lags to compute/display.
    alpha : float
        Significance level for confidence intervals (e.g. 0.05 -> 95% CI).
    dropna_strategy : str
        How to handle NA before analysis: "drop", "ffill", "bfill", "interpolate".
    pacf_method : str
        Method passed to statsmodels.pacf for PACF computation.
    plot : bool
        Whether to produce ACF/PACF plots.
    figsize : tuple
        Figure size for each variable's pair of plots.
    title_prefix : str
        Optional prefix added to plot titles.
    return_data : bool
        Return ACF/PACF arrays and summary DataFrame in a dict.
    acf_fft : bool
        Whether to use FFT-based ACF calculation (faster for long series).
    plot_kwargs : dict
        Extra keyword args forwarded to plot_acf / plot_pacf (e.g., lags, zero).
    
    Returns
    -------
    result : dict
        Keys are column names. Each value is a dict with:
          - 'series' : the cleaned pandas.Series used
          - 'acf'    : numpy array of acf values (length nlags+1)
          - 'pacf'   : numpy array of pacf values (length nlags+1)
          - 'acf_ci' : conf interval array (2 x (nlags+1)) if available
          - 'pacf_ci': conf interval for pacf (None if not available)
          - 'table'  : pd.DataFrame with columns ['lag','acf','pacf']
    """

    # Normalize cols to list
    if isinstance(cols, str):
        cols = [cols]

    # validate dropna strategy
    valid_strats = {"drop", "ffill", "bfill", "interpolate"}
    if dropna_strategy not in valid_strats:
        raise ValueError(f"dropna_strategy must be one of {valid_strats}")

    results = {}

    for col in cols:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found in dataframe")

        s = df[col]

        # Ensure numeric
        s = pd.to_numeric(s, errors="coerce")

        # Handle missing values
        if dropna_strategy == "drop":
            s_clean = s.dropna()
        elif dropna_strategy == "ffill":
            s_clean = s.fillna(method="ffill").dropna()
        elif dropna_strategy == "bfill":
            s_clean = s.fillna(method="bfill").dropna()
        elif dropna_strategy == "interpolate":
            s_clean = s.interpolate().dropna()
        else:
            s_clean = s.dropna()

        if len(s_clean) < 5:
            raise ValueError(f"Column {col} has too few non-NA samples after cleaning ({len(s_clean)}).")

        # Compute ACF and PACF using statsmodels
        # _acf returns (acf_vals, confint) if alpha provided; set fft param for performance
        acf_vals, acf_ci = _acf(s_clean, nlags=nlags, alpha=alpha, fft=acf_fft)
        # pacf: statsmodels.pacf returns pacf array; conf int is not always returned by pacf function,
        # so we compute pacf alone. Use method parameter to choose algorithm.
        pacf_vals = _pacf(s_clean, nlags=nlags, method=pacf_method)

        # Create table for quick review
        lags = np.arange(len(acf_vals))
        table = pd.DataFrame({
            "lag": lags,
            "acf": acf_vals,
            "pacf": pacf_vals
        })

        results[col] = {
            "series": s_clean,
            "acf": acf_vals,
            "acf_ci": acf_ci,
            "pacf": pacf_vals,
            "pacf_ci": None,
            "table": table
        }

        # Plotting
        if plot:
            fig, axes = plt.subplots(1, 2, figsize=figsize)
            # ACF plot (uses statsmodels plot_acf for correct CI ribbons)
            plot_acf(
                s_clean,
                ax=axes[0],
                lags=nlags,
                alpha=alpha,
                fft=acf_fft,
                **plot_kwargs
            )
            axes[0].set_title(f"{title_prefix}{col} — ACF")

            # PACF plot
            plot_pacf(
                s_clean,
                ax=axes[1],
                lags=nlags,
                alpha=alpha,
                method=pacf_method,
                **plot_kwargs
            )
            axes[1].set_title(f"{title_prefix}{col} — PACF")

            plt.tight_layout()
            plt.show()

    if return_data:
        return results
    else:
        return None

In [None]:
# ACF/PACF of humidity, limited to the first 3 lags (matches the lag features built above).
res = check_acf_pacf(df, cols='humidity', nlags=3)

In [None]:
# Same analysis for temperature; note this overwrites `res` from the humidity cell.
res = check_acf_pacf(df, cols='temperature', nlags=3)

In [None]:
def check_stationarity_adf(
    series: Union[pd.Series, pd.DataFrame],
    col: str = None,
    alpha: float = 0.05,
    print_result: bool = True
):
    """
    Run the Augmented Dickey-Fuller (ADF) test on a series and report the outcome.

    Parameters
    ----------
    series : pd.Series or pd.DataFrame
        The series to test, or a dataframe that holds it.
    col : str, optional
        Column to test when a DataFrame is passed; ignored for a Series.
    alpha : float
        Significance level the p-value is compared against. Default 0.05.
    print_result : bool
        Print a human-readable report of the statistics and interpretation.

    Returns
    -------
    Dict[str, Any]
        Keys: 'adf_statistic', 'p_value', 'used_lags', 'n_obs',
        'critical_values', 'is_stationary' (True when p_value < alpha).

    Raises
    ------
    ValueError
        If a DataFrame is passed without 'col'.
    """

    # Pick out the values to test (NA removed) and a label for the report.
    if not isinstance(series, pd.DataFrame):
        values = series.dropna()
        label = series.name or "series"
    elif col is None:
        raise ValueError("For DataFrame input, specify the 'col' argument.")
    else:
        values = series[col].dropna()
        label = col

    # Only the first five entries of the adfuller result are used here.
    adf_stat, p_value, used_lags, n_obs, critical_values = adfuller(
        values, autolag="AIC"
    )[:5]

    # Null hypothesis is a unit root; a small p-value rejects it.
    is_stationary = p_value < alpha

    if print_result:
        report = [
            f"\nADF Stationarity Test for '{label}':",
            "---------------------------------------",
            f"ADF Statistic : {adf_stat:.4f}",
            f"p-value       : {p_value:.4f}",
            f"Used lags     : {used_lags}",
            f"Observations  : {n_obs}",
            "Critical Values:",
        ]
        report.extend(f"   {k}: {v:.4f}" for k, v in critical_values.items())

        report.append("\nInterpretation:")
        if is_stationary:
            report.extend([
                f"✔ The series **IS STATIONARY** at alpha = {alpha}.",
                "  → Reject the null hypothesis (unit root).",
                "  → Mean/variance/autocorrelation are stable.",
            ])
        else:
            report.extend([
                f"✖ The series **IS NOT STATIONARY** at alpha = {alpha}.",
                "  → Cannot reject the null hypothesis.",
                "  → Series likely has trend, seasonality, or non-constant variance.",
            ])
        print("\n".join(report))

    return dict(
        adf_statistic=adf_stat,
        p_value=p_value,
        used_lags=used_lags,
        n_obs=n_obs,
        critical_values=critical_values,
        is_stationary=is_stationary,
    )

In [None]:
# ADF test on temperature: prints the report and displays the returned summary dict.
check_stationarity_adf(df['temperature'])


In [None]:
# ADF test on humidity: prints the report and displays the returned summary dict.
check_stationarity_adf(df['humidity'])

In [None]:
# Columns used for modelling: 'humidity' is the target (split off later), the rest are features.
training_data = df[['temp_lag1','humidity','humidity_lag3','hour','temperature']]

In [None]:
# Drop rows with any NA — at minimum the leading rows where the shifted lag columns are undefined.
training_data = training_data.dropna()

In [None]:
# Correlations among the selected modelling columns only.
training_data.corr()

In [None]:
def get_distribution_type(data: pd.DataFrame, feature: str, **kwargs):
    """
    Draw the distribution of one column with seaborn.displot and show it.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe that holds the column.
    feature : str
        Name of the column to plot.
    **kwargs :
        Forwarded unchanged to sns.displot().

    Notes
    -----
    Any exception is reported on stdout and then re-raised; nothing is returned.
    """
    try:
        column = data[feature]
        sns.displot(column, **kwargs)
        plt.show()
    except Exception as err:
        # Report which feature failed, then let the caller see the original error.
        print(f"Error plotting feature '{feature}': {err}")
        raise

def plot_relationship(data: pd.DataFrame, col1: str = 'temperature', col2: str = 'humidity', **kwargs):
    """
    Scatter plot of one column against another (col1 on x, col2 on y).

    Parameters
    ----------
    data : pd.DataFrame
        The dataframe containing both columns.
    col1 : str, default='temperature'
        Column plotted on the x-axis.
    col2 : str, default='humidity'
        Column plotted on the y-axis.
    **kwargs :
        The following keys are consumed here and NOT forwarded:
          - figsize : tuple, default (6, 4)
          - xlabel  : str, default "<Col1> (°C)"
          - ylabel  : str, default "<Col2> (%)"
          - title   : str, default "<Col1> vs <Col2>"
        Everything else is passed to plt.scatter().
        Examples: color='red', alpha=0.5, s=20, marker='x'

    Notes
    -----
    The default axis labels assume a temperature-like x column and a
    humidity-like y column; pass xlabel/ylabel when plotting other quantities.
    Any exception is reported on stdout and then re-raised.
    """
    try:
        # Pull out the figure/label options so only scatter options remain in kwargs.
        figsize = kwargs.pop('figsize', (6, 4))
        xlabel = kwargs.pop('xlabel', f"{col1.capitalize()} (°C)")
        ylabel = kwargs.pop('ylabel', f"{col2.capitalize()} (%)")
        title = kwargs.pop('title', f"{col1.capitalize()} vs {col2.capitalize()}")

        plt.figure(figsize=figsize)
        plt.scatter(data[col1], data[col2], **kwargs)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.grid(True, linestyle='--', alpha=0.4)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error plotting {col1} vs {col2}: {e}")
        raise

In [None]:
# Scatter of the previous-hour temperature against humidity three hours back.
plot_relationship(data=training_data, col1='temp_lag1', col2='humidity_lag3')

In [None]:
# Target is current humidity; every other selected column is a feature.
# NOTE(review): the features include the current-hour 'temperature' — confirm
# that value is available at prediction time.
y = training_data['humidity']
x = training_data.drop(columns='humidity')

In [None]:
# Confirm which columns ended up in the feature matrix.
x.columns

In [None]:
# Chronological hold-out: the first 80% of rows train, the last 20% test.
# The rows form an hourly series with lag features, so a shuffled split would
# put hours adjacent to training rows into the test set and overstate test R².
# random_state is omitted because nothing is shuffled.
# NOTE(review): assumes the rows are in time order — confirm.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=False
)

In [None]:
# Fix the seed so the forest (bootstrap samples / feature sub-sampling) and
# therefore the scores below are reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)


In [None]:
# In-sample predictions, used for the training R² and the training scatter plot.
result_train = model.predict(train_x)

In [None]:
# Hold-out predictions, used for the test R² and the actual-vs-predicted plot.
result_test = model.predict(test_x)

In [None]:
# Label the score so it cannot be confused with the test-set value.
print(f"Train R^2: {r2_score(train_y, result_train):.4f}")

In [None]:
# Label the score so it cannot be confused with the training-set value.
print(f"Test R^2: {r2_score(test_y, result_test):.4f}")

In [None]:
# Least-squares straight line through the (actual, predicted) pairs.
m, b = np.polyfit(test_y, result_test, deg=1)   # slope (m) and intercept (b)
x_line = np.linspace(test_y.min(), test_y.max(), num=100)
y_line = m * x_line + b

# Points first, then the fitted line on top.
plt.scatter(test_y, result_test)
plt.plot(x_line, y_line, color='red', linewidth=2)

plt.title("Actual vs Predicted Humidity (Test Set)")
plt.xlabel("Actual Humidity")
plt.ylabel("Predicted Humidity")
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Training-set counterpart of the test plot, with the same labelling so the
# two figures can be compared when skimming the notebook.
plt.scatter(train_y, result_train)
plt.xlabel("Actual Humidity")
plt.ylabel("Predicted Humidity")
plt.title("Actual vs Predicted Humidity (Train Set)")
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Preview the cleaned modelling frame.
training_data.head()

In [None]:
# Summary statistics instead of dumping the whole frame (its first rows are
# already shown by the head() cell above).
training_data.describe()

In [None]:
# (rows, features) of the full feature matrix.
x.shape

In [None]:
# Size of the training split (test_size=0.2 leaves about 80% of the rows here).
train_x.shape

In [None]:
# Size of the held-out split (test_size=0.2).
test_x.shape

In [None]:
# Shape of the original frame, for comparison with the rows kept after dropna().
df.shape