In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.interpolate import interp1d
import statsmodels.api as sm
import os
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.utils import pad_sequences


In [None]:
#cell to import additional libraries or define helper functions
from google.colab import files
# Seed every random number generator the notebook relies on so that runs are repeatable.
# Seeding NumPy alone leaves Python's `random` module and TensorFlow (weight
# initialisation, shuffling) unseeded.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# A plain Python variable called PYTHONHASHSEED does not influence hashing; the value has
# to be exported as an environment variable. NOTE: this only affects interpreters started
# afterwards (e.g. subprocesses) - the hash seed of the running kernel is already fixed.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.model_selection import train_test_split
import zipfile
import io

In [None]:
# File upload: calls files.upload() from google.colab and keeps the result in `uploaded`.
# A later cell iterates `uploaded.keys()` and takes len() of each value to report the
# uploaded file names and sizes.
uploaded = files.upload()

The file uploaded above is the CSV from the UCI repository dataset "[Appliances Energy Prediction](https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction)", with variable descriptions added manually from [this GitHub file](https://github.com/LuisM78/Appliances-energy-prediction-data/blob/master/variables%20description.txt).

Location: Chièvres
7950 Chievres
Belgium


In [None]:
# Report the name and size (in bytes) of every file received by the upload cell
for name, payload in uploaded.items():
    print(f'User uploaded file "{name}" with length {len(payload)} bytes')

In [None]:
# Load the energy CSV by file name into the raw DataFrame.
# The file is ISO-8859-1 encoded (the column names contain characters such as '°').
df_raw = pd.read_csv(
    filepath_or_buffer='energydata_transformed.csv',
    encoding='ISO-8859-1',
)

# Show the first five rows as a quick sanity check
print(df_raw.head(5))


In [None]:
# Work on a deep copy so that `df_raw` keeps the data exactly as it was loaded
df = df_raw.copy(deep=True)
df.head(5)

In [None]:

def _plot_series(series, series_name, series_index=0):
  """Draw one 'Office Room °C' line against 'Timestamp' with pyplot.

  Args:
    series: table-like object providing 'Timestamp' and 'Office Room °C' columns.
    series_name: label attached to the line (used by the legend).
    series_index: position of this line among all lines; selects the colour,
      cycling through the 'Dark2' palette.
  """
  colours = list(sns.palettes.mpl_palette('Dark2'))
  line_colour = colours[series_index % len(colours)]
  plt.plot(
      series['Timestamp'],
      series['Office Room °C'],
      label=series_name,
      color=line_colour,
  )

# Office-room temperature over time, drawn from the working DataFrame `df`.
# This cell previously read from `_df_12`, a Colab auto-generated name that does not
# exist on a fresh kernel, and grouped by 'Timestamp', which produces one single-row
# series (and one legend rebuild) per reading. A single line over the sorted
# timestamps shows the same data.
fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
# Parse the timestamps so the x-axis is chronological rather than lexicographic.
# The format string is the one the notebook uses when it creates the 'TS' column.
df_sorted = df.assign(
    Timestamp=pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M')
).sort_values('Timestamp', ascending=True)
_plot_series(df_sorted, 'Office Room °C')
fig.legend(title='Series', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('Timestamp')
_ = plt.ylabel('Office Room °C')

In [None]:
# Scatter of living-room vs laundry relative humidity.
# Reads from `df`; the previous source `_df_8` was a Colab auto-generated name that is
# undefined on a fresh kernel.
ax = df.plot(kind='scatter', x='Living Room RH %', y='Laundry RH %', s=32, alpha=.8)
ax.set_title('Living Room RH % vs Laundry RH %')
ax.spines[['top', 'right']].set_visible(False)

In [None]:

# Distribution of living-room relative humidity.
# Reads from `df`; the previous source `_df_3` was a Colab auto-generated name that is
# undefined on a fresh kernel.
ax = df['Living Room RH %'].plot(kind='hist', bins=20, title='Living Room RH %')
ax.set_xlabel('Living Room RH %')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Summary statistics of the working DataFrame (displayed as the cell output)
df.describe()

In [None]:
# Column dtypes before the datetime column 'TS' is added
df.dtypes

In [None]:
# Parse 'Timestamp' (month/day/year hour:minute) into the datetime column 'TS'.
# `dayfirst=True` was removed: it contradicts the month-first format string, and the
# explicit `format` already fixes the order of the date fields.
df['TS'] = pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M')

# Calendar features derived from 'TS'
df['Day of Week'] = df['TS'].dt.strftime('%a')  # abbreviated weekday name, e.g. 'Mon'
df['Hour'] = df['TS'].dt.hour
df['Month'] = df['TS'].dt.month
df['Year'] = df['TS'].dt.year

# Show the first rows including the new columns
df.head()

Data consistency check

In [None]:
# Column dtypes of the working DataFrame (displayed as the cell output)
df.dtypes

In [None]:
# Bar chart of the number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column.plot.bar()

In [None]:
# One row of three panels: appliance and light energy against kitchen temperature,
# hour of the day and day of the week.
fig, axes = plt.subplots(1, 3, figsize=(16, 6))

# (x column, text used for that panel's title)
panels = [
    ('Kitchen Temp °C', 'Kitchen Temp °C'),
    ('Hour', 'Hour of the Day'),
    ('Day of Week', 'Day of Week'),
]
energy_columns = ('Appliances energy Kwh', 'Lights energy Kwh')

for ax, (x_col, x_title) in zip(axes, panels):
    # Both energy series share the axes, so each line is labelled and the panel gets
    # one combined title (setting the title twice kept only the second one).
    for y_col in energy_columns:
        sns.lineplot(ax=ax, x=x_col, y=y_col, data=df, label=y_col)
    ax.set_title(f'Appliances and Lights Energy Kwh vs. {x_title}')
    ax.set_ylabel('Energy Kwh')
    ax.legend()

# Show the plots
plt.tight_layout()
plt.show()


In [None]:
# Index the frame by the parsed datetime column 'TS'.
# Guarded so the cell can be re-run: an unconditional set_index raises KeyError once
# 'TS' has already been moved into the index.
if df.index.name != 'TS':
    df = df.set_index('TS')

# Only float64/int64 columns take part in the correlation matrix and the pairplot
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlations between the numeric columns
corr_matrix = df[numeric_cols].corr()

# Heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Pairwise scatter plots (KDE on the diagonal); this grows quadratically with the
# number of numeric columns and can take a long time to render.
sns.pairplot(df[numeric_cols], diag_kind='kde', plot_kws={'alpha': 0.5, 's': 30})
plt.show()

In [None]:
# Make sure the frame is indexed by datetime 'TS' without first moving it back into a
# column; the guard keeps the cell re-runnable whichever state `df` is in.
if df.index.name != 'TS':
    df = df.set_index('TS')
df.index = pd.to_datetime(df.index)

# Appliance and light energy over the whole recording period
fig, ax = plt.subplots(figsize=(12, 6))
for column in ('Appliances energy Kwh', 'Lights energy Kwh'):
    ax.plot(df[column], label=column)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Energy Kwh')
ax.set_title('Trend Chart of Appliances and Lights Energy Kwh vs TS')
ax.legend()
plt.show()

In [None]:
# Keep only the rows whose index falls in January
january_df = df.loc[df.index.month == 1]

# Energy use (solid lines) and outdoor conditions (dashed lines) for January.
# Each entry: (column, legend label, line style)
january_lines = [
    ('Appliances energy Kwh', 'Appliances energy Kwh', '-'),
    ('Lights energy Kwh', 'Lights energy Kwh', '-'),
    ('Outside Temperature  °C', 'Outside Temperature', '--'),
    ('Outside RH%', 'Outside RH%', '--'),
]

plt.figure(figsize=(12, 6))
for column, legend_label, line_style in january_lines:
    plt.plot(january_df[column], label=legend_label, linestyle=line_style)
plt.xlabel('Timestamp')
plt.ylabel('Values')
plt.title('Trend Chart of Energy Usage and Outside Conditions vs TS (January)')
plt.legend()
plt.show()

In [None]:
# Keep only the rows whose index falls in May (month number 5)
may_df = df.loc[df.index.month == 5]

# Energy use (solid lines) and outdoor conditions (dashed lines) for May.
# Each entry: (column, legend label, line style)
may_lines = [
    ('Appliances energy Kwh', 'Appliances energy Kwh', '-'),
    ('Lights energy Kwh', 'Lights energy Kwh', '-'),
    ('Outside Temperature  °C', 'Outside Temperature', '--'),
    ('Outside RH%', 'Outside RH%', '--'),
]

plt.figure(figsize=(12, 6))
for column, legend_label, line_style in may_lines:
    plt.plot(may_df[column], label=legend_label, linestyle=line_style)
plt.xlabel('Timestamp')
plt.ylabel('Values')
plt.title('Trend Chart of Energy Usage and Outside Conditions vs TS (May)')
plt.legend()
plt.show()


In [None]:
# Move 'TS' out of the index so it can be used as a regular column.
# Guarded: an unconditional reset_index() on a re-run would push the default integer
# index into a spurious 'index' column.
if df.index.name == 'TS':
    df = df.reset_index()

# Ensure 'TS' is a datetime column
df['TS'] = pd.to_datetime(df['TS'])

# Time-of-day part of 'TS' as a separate column
df['Time'] = df['TS'].dt.time

# Show the frame with the new 'Time' column
print(df.head())

df['Time']

In [None]:
# Number of rows in the working DataFrame
len(df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Bring 'TS' back as a column only if it is currently the index; an unconditional
# reset_index() would add a spurious 'index' column when 'TS' is already a column.
if df.index.name == 'TS':
    df = df.reset_index()

# Ensure 'TS' is datetime and use it as the index
df['TS'] = pd.to_datetime(df['TS'])
df = df.set_index('TS')

# Model input (outside temperature) and target (appliance energy).
# The temperature column name contains TWO spaces before '°C', as in the rest of the
# notebook; the single-space spelling raises KeyError.
data = df[['Outside Temperature  °C', 'Appliances energy Kwh']]

# Scale both columns to [0, 1].
# NOTE(review): the scaler is fitted on the full series, so test-period min/max values
# leak into the training data; consider fitting on the training portion only.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Prepare the data for LSTM
def create_dataset(data, time_step=1):
    """Turn a 2-D array into sliding-window samples for sequence models.

    Window ``i`` holds column 0 of rows ``i .. i + time_step - 1``; its target is
    column 1 of row ``i + time_step``.

    Args:
        data: 2-D array-like with at least two columns (column 0 = input feature,
            column 1 = target).
        time_step: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, time_step)`` and ``y``
        has shape ``(n_samples,)``, with ``n_samples = len(data) - time_step``.
        When there are too few rows for a single window, empty arrays of shape
        ``(0, time_step)`` and ``(0,)`` are returned so that a later reshape still works.
    """
    data = np.asarray(data)
    # The last usable window ends one row before the final row, whose target is the
    # final row itself; subtracting an extra 1 here would silently drop that sample.
    n_samples = len(data) - time_step
    if n_samples <= 0:
        return np.empty((0, time_step)), np.empty((0,))
    X = np.stack([data[start:start + time_step, 0] for start in range(n_samples)])
    y = data[time_step:, 1].copy()
    return X, y

# Each sample uses a window of 24 consecutive rows.
# NOTE(review): this is one day only if the rows are hourly - confirm the sampling interval.
time_step = 24
X, y = create_dataset(scaled_data, time_step)

# Reshape input to [samples, time steps, features]
X = X.reshape(X.shape[0], X.shape[1], 1)

# Chronological 80/20 split (no shuffling, so the test set is the most recent data)
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Two stacked LSTM layers followed by a single linear output unit
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model and keep the returned History so the loss curves can be plotted
# later without calling fit() (and therefore training) again.
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Make predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Undo the scaling. The scaler was fitted on two columns, so each target vector is
# padded with a zero first column and only column 1 (the target) is kept afterwards.
train_predict = scaler.inverse_transform(np.concatenate((np.zeros((train_predict.shape[0], 1)), train_predict), axis=1))[:, 1]
test_predict = scaler.inverse_transform(np.concatenate((np.zeros((test_predict.shape[0], 1)), test_predict), axis=1))[:, 1]
y_train = scaler.inverse_transform(np.concatenate((np.zeros((len(y_train), 1)), y_train.reshape(-1, 1)), axis=1))[:, 1]
y_test = scaler.inverse_transform(np.concatenate((np.zeros((len(y_test), 1)), y_test.reshape(-1, 1)), axis=1))[:, 1]

# Timestamps of the test targets: the target of window i is row i + time_step, so the
# first test target sits at row time_step + train_size. Slicing the tail of the index
# with [-len(y_test):] is only correct if the windows run to the very last row.
test_start = time_step + train_size
test_index = df.index[test_start:test_start + len(y_test)]

# Plot the results
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_index, y_test, label='Actual Appliances energy Kwh')
ax.plot(test_index, test_predict, label='Predicted Appliances energy Kwh')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Appliances energy Kwh')
ax.set_title('Actual vs Predicted Appliances energy Kwh')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# If 'TS' is currently the index, move it back into a regular column first
if df.index.name == 'TS':
    df = df.reset_index()

# Make sure 'TS' holds datetimes, then index the frame by it
df = df.assign(TS=pd.to_datetime(df['TS'])).set_index('TS')

# Model input (outside temperature) and target (appliance energy).
# Note the two spaces before '°C' in the temperature column name.
data = df.loc[:, ['Outside Temperature  °C', 'Appliances energy Kwh']]

# Scale both columns to the [0, 1] range
scaler = MinMaxScaler()
scaled_data = scaler.fit(data).transform(data)

# Prepare the data for LSTM
def create_dataset(data, time_step=1):
    """Turn a 2-D array into sliding-window samples for sequence models.

    Window ``i`` holds column 0 of rows ``i .. i + time_step - 1``; its target is
    column 1 of row ``i + time_step``.

    Args:
        data: 2-D array-like with at least two columns (column 0 = input feature,
            column 1 = target).
        time_step: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, time_step)`` and ``y``
        has shape ``(n_samples,)``, with ``n_samples = len(data) - time_step``.
        When there are too few rows for a single window, empty arrays of shape
        ``(0, time_step)`` and ``(0,)`` are returned so that a later reshape still works.
    """
    data = np.asarray(data)
    # The last usable window ends one row before the final row, whose target is the
    # final row itself; subtracting an extra 1 here would silently drop that sample.
    n_samples = len(data) - time_step
    if n_samples <= 0:
        return np.empty((0, time_step)), np.empty((0,))
    X = np.stack([data[start:start + time_step, 0] for start in range(n_samples)])
    y = data[time_step:, 1].copy()
    return X, y

# Each sample uses a window of 24 consecutive rows.
# NOTE(review): this is one day only if the rows are hourly - confirm the sampling interval.
time_step = 24
X, y = create_dataset(scaled_data, time_step)

# Reshape input to [samples, time steps, features]
X = X.reshape(X.shape[0], X.shape[1], 1)

# Chronological 80/20 split (no shuffling, so the test set is the most recent data)
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Two stacked LSTM layers followed by a single linear output unit
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model and keep the returned History so the loss curves can be plotted
# later without calling fit() (and therefore training) again.
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Make predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Undo the scaling. The scaler was fitted on two columns, so each target vector is
# padded with a zero first column and only column 1 (the target) is kept afterwards.
train_predict = scaler.inverse_transform(np.concatenate((np.zeros((train_predict.shape[0], 1)), train_predict), axis=1))[:, 1]
test_predict = scaler.inverse_transform(np.concatenate((np.zeros((test_predict.shape[0], 1)), test_predict), axis=1))[:, 1]
y_train = scaler.inverse_transform(np.concatenate((np.zeros((len(y_train), 1)), y_train.reshape(-1, 1)), axis=1))[:, 1]
y_test = scaler.inverse_transform(np.concatenate((np.zeros((len(y_test), 1)), y_test.reshape(-1, 1)), axis=1))[:, 1]

# Timestamps of the test targets: the target of window i is row i + time_step, so the
# first test target sits at row time_step + train_size. Slicing the tail of the index
# with [-len(y_test):] is only correct if the windows run to the very last row.
test_start = time_step + train_size
test_index = df.index[test_start:test_start + len(y_test)]

# Plot the results
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_index, y_test, label='Actual Appliances energy Kwh')
ax.plot(test_index, test_predict, label='Predicted Appliances energy Kwh')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Appliances energy Kwh')
ax.set_title('Actual vs Predicted Appliances energy Kwh')
ax.legend()
plt.show()

In [None]:
# Summarize the loss (MSE) history of the training run that produced `model`.
# Keras attaches the History of the most recent fit() call to the model as
# `model.history`. Reusing it avoids calling fit() a second time, which would train the
# already-fitted model for 50 more epochs and plot those extra epochs instead.
history = model.history

fig_acc = plt.figure(figsize=(10, 10))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss/MSE')
plt.ylabel('loss')
plt.xlabel('epoch')
# 'test' is the validation_data passed to fit(), i.e. the held-out test split
plt.legend(['train', 'test'], loc='upper left')
# Save through the figure object, and before show(), so the written file is this plot
fig_acc.savefig("LSTM_loss1.png")
plt.show()

**Observation**:

Initially, both the training and testing losses are high, with the training loss starting above 16000 and the testing loss starting above 11000. As the number of epochs increases, both losses decrease rapidly at first and then level off. The training loss stabilizes around 11000, while the testing loss stabilizes around 8000.

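This graph indicates that the model's performance improves over time, as evidenced by the decreasing loss values. However, the training loss remains higher than the testing loss, which is the opposite of the usual sign of overfitting (overfitting shows up as a training loss well below the testing loss). The testing loss stabilizing at a lower value suggests that the model performs at least as well on unseen data, although it may also reflect differences between the training and testing periods of the series, so it should be interpreted with caution.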

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt

# Scale both columns of `data` to the [0, 1] range with a freshly fitted scaler
scaler = MinMaxScaler()
scaled_data = scaler.fit(data).transform(data)

# Prepare the data for LSTM with Conv1D
def create_dataset(data, time_step=1):
    """Turn a 2-D array into sliding-window samples for sequence models.

    Window ``i`` holds column 0 of rows ``i .. i + time_step - 1``; its target is
    column 1 of row ``i + time_step``.

    Args:
        data: 2-D array-like with at least two columns (column 0 = input feature,
            column 1 = target).
        time_step: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, time_step)`` and ``y``
        has shape ``(n_samples,)``, with ``n_samples = len(data) - time_step``.
        When there are too few rows for a single window, empty arrays of shape
        ``(0, time_step)`` and ``(0,)`` are returned so that a later reshape still works.
    """
    data = np.asarray(data)
    # The last usable window ends one row before the final row, whose target is the
    # final row itself; subtracting an extra 1 here would silently drop that sample.
    n_samples = len(data) - time_step
    if n_samples <= 0:
        return np.empty((0, time_step)), np.empty((0,))
    X = np.stack([data[start:start + time_step, 0] for start in range(n_samples)])
    y = data[time_step:, 1].copy()
    return X, y

# Each sample uses a window of 24 consecutive rows.
# NOTE(review): this is one day only if the rows are hourly - confirm the sampling interval.
time_step = 24
X, y = create_dataset(scaled_data, time_step)

# Reshape input to [samples, time steps, features]
X = X.reshape(X.shape[0], X.shape[1], 1)

# Chronological 80/20 split (no shuffling, so the test set is the most recent data)
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Conv1D + max-pooling front end feeding two stacked LSTM layers and a linear output
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(time_step, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Make predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Undo the scaling. The scaler was fitted on two columns, so each target vector is
# padded with a zero first column and only column 1 (the target) is kept afterwards.
train_predict = scaler.inverse_transform(np.concatenate((np.zeros((train_predict.shape[0], 1)), train_predict), axis=1))[:, 1]
test_predict = scaler.inverse_transform(np.concatenate((np.zeros((test_predict.shape[0], 1)), test_predict), axis=1))[:, 1]
y_train = scaler.inverse_transform(np.concatenate((np.zeros((len(y_train), 1)), y_train.reshape(-1, 1)), axis=1))[:, 1]
y_test = scaler.inverse_transform(np.concatenate((np.zeros((len(y_test), 1)), y_test.reshape(-1, 1)), axis=1))[:, 1]

# Timestamps of the test targets: the target of window i is row i + time_step, so the
# first test target sits at row time_step + train_size.
test_start = time_step + train_size
test_index = df.index[test_start:test_start + len(y_test)]

# Restrict the TEST period to June and July. Plotting every June/July timestamp of the
# whole frame against y_test fails because the lengths do not match.
june_july_mask = (test_index.month == 6) | (test_index.month == 7)
june_july_test_indices = test_index[june_july_mask]
if not june_july_mask.any():
    print('No June or July timestamps fall inside the test period; the plot will be empty.')

# Plot the results for the months of June and July
fig_pred, ax = plt.subplots(figsize=(12, 6))
ax.plot(june_july_test_indices, y_test[june_july_mask], label='Actual Appliances energy Kwh')
ax.plot(june_july_test_indices, test_predict[june_july_mask], label='Predicted Appliances energy Kwh')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Appliances energy Kwh')
ax.set_title('Actual vs Predicted Appliances energy Kwh for June and July')
ax.legend()

# Save THIS figure (not the earlier loss figure `fig_acc`) before it is shown
fig_pred.savefig("LSTM_Conv1D_predictions_June_July.png")
plt.show()


In [None]:
import pandas as pd

# Actual target values, indexed by timestamp. `data` is a DataFrame whose index holds
# the timestamps; asking pd.DataFrame for columns that do not exist in it
# ('Timestamp', 'Actual Appliances energy Kwh') produces a frame of NaNs, so the
# existing target column is selected and renamed instead.
original_df = data[['Appliances energy Kwh']].rename(
    columns={'Appliances energy Kwh': 'Actual Appliances energy Kwh'}
)
original_df.index = pd.to_datetime(original_df.index)

# The prediction for window i belongs to row i + time_step, so the training predictions
# start at row `time_step` and the test predictions follow directly after them.
train_start = time_step
test_start = train_start + len(train_predict)
train_predict_df = pd.DataFrame(
    train_predict,
    index=original_df.index[train_start:test_start],
    columns=['Train Predicted Appliances energy Kwh'],
)
test_predict_df = pd.DataFrame(
    test_predict,
    index=original_df.index[test_start:test_start + len(test_predict)],
    columns=['Test Predicted Appliances energy Kwh'],
)

# Concatenate original and predicted dataframes side by side on the timestamp index
merged_df = pd.concat([original_df, train_predict_df, test_predict_df], axis=1)

# Export to Excel (the timestamp index is written as the first column)
merged_df.to_excel("original_and_predicted_data.xlsx", index=True)

print("The dataset has been successfully exported to 'original_and_predicted_data.xlsx'.")


In [None]:
# Download 'original_and_predicted_data.xlsx' (the Excel file written by the export
# cell above) through google.colab's files.download

from google.colab import files
files.download('original_and_predicted_data.xlsx')


In [None]:
# Plot the test-period results for the months of June and July.
# Timestamps of the test targets: the target of window i is row i + time_step, so the
# first test target sits at row time_step + train_size.
test_start = time_step + train_size
test_index = df.index[test_start:test_start + len(y_test)]

# Boolean mask selecting the June/July part of the test period
june_july_test_indices_bool = (test_index.month == 6) | (test_index.month == 7)
june_july_indices = df.index[(df.index.month == 6) | (df.index.month == 7)]
june_july_test_indices = test_index[june_july_test_indices_bool]
june_july_y_test = y_test[june_july_test_indices_bool]
june_july_test_predict = test_predict[june_july_test_indices_bool]
if not june_july_test_indices_bool.any():
    print('No June or July timestamps fall inside the test period; the plot will be empty.')

fig_pred, ax = plt.subplots(figsize=(12, 6))
# Use the filtered data for plotting
ax.plot(june_july_test_indices, june_july_y_test, label='Actual Appliances energy Kwh')
ax.plot(june_july_test_indices, june_july_test_predict, label='Predicted Appliances energy Kwh')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Appliances energy Kwh')
ax.set_title('Actual vs Predicted Appliances energy Kwh for June and July')
ax.legend()

# Save THIS figure (not the earlier loss figure `fig_acc`) before it is shown
fig_pred.savefig("LSTM_Conv1D_predictions_June_July.png")
plt.show()

Basic statistical analysis

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scale both columns of `data` to the [0, 1] range with a freshly fitted scaler
scaler = MinMaxScaler()
scaled_data = scaler.fit(data).transform(data)

# Prepare the data for LSTM with Conv1D
def create_dataset(data, time_step=1):
    """Turn a 2-D array into sliding-window samples for sequence models.

    Window ``i`` holds column 0 of rows ``i .. i + time_step - 1``; its target is
    column 1 of row ``i + time_step``.

    Args:
        data: 2-D array-like with at least two columns (column 0 = input feature,
            column 1 = target).
        time_step: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, time_step)`` and ``y``
        has shape ``(n_samples,)``, with ``n_samples = len(data) - time_step``.
        When there are too few rows for a single window, empty arrays of shape
        ``(0, time_step)`` and ``(0,)`` are returned so that a later reshape still works.
    """
    data = np.asarray(data)
    # The last usable window ends one row before the final row, whose target is the
    # final row itself; subtracting an extra 1 here would silently drop that sample.
    n_samples = len(data) - time_step
    if n_samples <= 0:
        return np.empty((0, time_step)), np.empty((0,))
    X = np.stack([data[start:start + time_step, 0] for start in range(n_samples)])
    y = data[time_step:, 1].copy()
    return X, y

# Each sample uses a window of 24 consecutive rows.
# NOTE(review): this is one day only if the rows are hourly - confirm the sampling interval.
time_step = 24
X, y = create_dataset(scaled_data, time_step)

# Reshape input to [samples, time steps, features]
X = X.reshape(X.shape[0], X.shape[1], 1)

# Chronological 80/20 split (no shuffling, so the test set is the most recent data)
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Conv1D + max-pooling front end feeding two stacked LSTM layers and a linear output
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(time_step, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Make predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Undo the scaling. The scaler was fitted on two columns, so each target vector is
# padded with a zero first column and only column 1 (the target) is kept afterwards.
train_predict = scaler.inverse_transform(np.concatenate((np.zeros((train_predict.shape[0], 1)), train_predict), axis=1))[:, 1]
test_predict = scaler.inverse_transform(np.concatenate((np.zeros((test_predict.shape[0], 1)), test_predict), axis=1))[:, 1]
y_train = scaler.inverse_transform(np.concatenate((np.zeros((len(y_train), 1)), y_train.reshape(-1, 1)), axis=1))[:, 1]
y_test = scaler.inverse_transform(np.concatenate((np.zeros((len(y_test), 1)), y_test.reshape(-1, 1)), axis=1))[:, 1]

# Timestamps of the test targets, taken from `data` (the frame that was scaled above):
# the target of window i is row i + time_step, so the first test target sits at row
# time_step + train_size.
test_start = time_step + train_size
test_index = pd.to_datetime(data.index[test_start:test_start + len(y_test)])

# Actual values and predictions of the test period, aligned on those timestamps
y_test_df = pd.DataFrame(y_test, index=test_index, columns=['Actual Appliances energy Kwh'])
test_predict_df = pd.DataFrame(test_predict, index=test_index, columns=['Predicted Appliances energy Kwh'])

# Select June and July with a boolean mask (robust even if timestamps repeat)
june_july_mask = (test_index.month == 6) | (test_index.month == 7)
june_july_indices = test_index[june_july_mask]
if not june_july_mask.any():
    print('No June or July timestamps fall inside the test period; the plot will be empty.')

# Plot the results for the months of June and July
fig_pred, ax = plt.subplots(figsize=(12, 6))
ax.plot(june_july_indices, y_test_df[june_july_mask], label='Actual Appliances energy Kwh')
ax.plot(june_july_indices, test_predict_df[june_july_mask], label='Predicted Appliances energy Kwh')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Appliances energy Kwh')
ax.set_title('Actual vs Predicted Appliances energy Kwh for June and July')
ax.legend()

# Save before show(): calling plt.savefig() after plt.show() writes an empty figure
fig_pred.savefig("LSTM_Conv1D_predictions_June_July.png")
plt.show()


In [None]:
from scipy.stats import pearsonr

# Linear (Pearson) correlation between appliance energy use and outside temperature,
# together with its p-value
appliance_energy = df['Appliances energy Kwh']
outside_temperature = df['Outside Temperature  °C']
corr, p_value = pearsonr(appliance_energy, outside_temperature)
print(f"Pearson correlation: {corr}, P-value: {p_value}")

A Pearson correlation of 0.099 suggests a very weak positive linear relationship between appliance energy use and outside temperature. This means that as the outside temperature increases, appliance energy use tends to increase slightly as well, but the relationship is not strong or reliable.

The P-value of 2.6247e-44 is extremely small, indicating that the observed correlation is highly statistically significant.

In [None]:
import statsmodels.api as sm

# Ordinary least squares fit of 'Appliances energy Kwh' on 'Outside Temperature  °C'
# (the column name has two spaces before '°C')
y = df['Appliances energy Kwh']
# add_constant supplies the intercept column alongside the predictor
X = sm.add_constant(df['Outside Temperature  °C'])

model = sm.OLS(y, X).fit()
predictions = model.predict(X)

# Regression summary table (displayed as the cell output)
model.summary()


In [None]:
from scipy.stats import pearsonr

# Linear (Pearson) correlation between appliance energy use and outside relative
# humidity, together with its p-value
appliance_energy = df['Appliances energy Kwh']
outside_humidity = df['Outside RH%']
corr, p_value = pearsonr(appliance_energy, outside_humidity)
print(f"Pearson correlation: {corr}, P-value: {p_value}")

The Pearson correlation indicates a weak negative linear relationship between appliance energy use and outside relative humidity. In practical terms, this means that as outside humidity increases, appliance energy use tends to decrease slightly, but the relationship is not strong or consistent.

The P-value of 1.0775e-102 is extremely small, indicating that the observed correlation is highly statistically significant.