In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV with "date" parsed as a datetime index, then move it back
# into an ordinary column so it can be used like any other feature.
df = pd.read_csv(
    "/kaggle/input/appliances-energy-prediction/KAG_energydata_complete.csv",
    index_col="date",
    parse_dates=True,
    date_format="%Y-%m-%d %H:%M:%S",
)
df = df.reset_index()
df.head()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Summary statistics for each column, as a first look at ranges and spread.
df.describe()

In [None]:
# Column names as a plain Python list.
df.columns.tolist()

In [None]:
# Reorder the columns alphabetically (sorted() on the column labels) so that
# related measurements sit next to each other.
sorted_columns = sorted(df.columns)
df = df.reindex(columns=sorted_columns)
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

There are no null values in the dataset.

In [None]:
# Flag working days: pandas numbers weekdays Monday=0 ... Sunday=6, so values
# above 4 are Saturday/Sunday and get 0; every other row gets 1.
day_of_week = df["date"].dt.weekday
is_weekend = day_of_week > 4
df["weekday"] = np.where(is_weekend, 0, 1)
df["weekday"]

In [None]:
# Preview the frame after adding the "weekday" column.
df.head()

In [None]:
# Number of columns; used to size the plot grids in the following cells.
df.shape[1]

In [None]:
# Histogram (with KDE overlay) of every column, laid out on a 5-row grid.
hist_features = list(df.columns)

nrows = 5
# Round UP: truncating the division leaves too few axes whenever the number of
# columns is not a multiple of nrows, and indexing past the grid raises IndexError.
ncols = int(np.ceil(len(hist_features) / nrows))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 20))
axes = axes.flatten()

for ax, feature in zip(axes, hist_features):
    sns.histplot(data=df, x=feature, kde=True, ax=ax)
    ax.set_title(feature)

# Hide any grid cells that did not receive a plot.
for ax in axes[len(hist_features):]:
    ax.set_visible(False)

fig.suptitle("Distribution of Features", fontsize=18)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every column except the timestamp, to eyeball outliers.
box_features = list(df.drop(columns=["date"]).columns)

nrows = 5
# Size the grid from the features actually plotted and round UP, so there is
# always one axis per feature regardless of how many columns the frame has.
ncols = int(np.ceil(len(box_features) / nrows))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 20))
axes = axes.flatten()

for ax, feature in zip(axes, box_features):
    sns.boxplot(data=df, y=feature, ax=ax)
    ax.set_title(feature)

# Hide the grid cells that did not receive a plot instead of showing empty axes.
for ax in axes[len(box_features):]:
    ax.set_visible(False)

fig.suptitle("Outlier Detection among Features", fontsize=18)

plt.tight_layout()
plt.show()

The features press_mm_hg, rh_1, rh_2, rh_3, rh_5, rh_7, rh_8, rh_9, rh_out, t1, t2, t3, t4, t5, t6, t7, t8, t_out, tdewpoint, visibility and windspeed all have outliers, which need to be treated before further model development.

In [None]:
# Frequency of each distinct value in the "lights" column.
df["lights"].value_counts()

The lights feature can be treated as a categorical variable, since it only ever takes one of 8 distinct values, and one value (0) significantly dominates the others.

    We can either map 0 to 0 and every other value to 1, or perform one-hot encoding.
    The first option seems better, as 0 is quite dominant with more than 15,000 entries out of a total of 19,735.

In [None]:
# Collapse "lights" into a binary flag: 0 stays 0, every other value becomes 1.
lights_on = df["lights"] != 0
df["lights"] = np.where(lights_on, 1, 0)

In [None]:
from scipy.stats import probplot

# Normal QQ-plot of every column except the timestamp.
qq_features = list(df.drop(columns=["date"]).columns)

nrows = 5
# Size the grid from the features actually plotted and round UP, so there is
# always one axis per feature regardless of how many columns the frame has.
ncols = int(np.ceil(len(qq_features) / nrows))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 10))
axes = axes.flatten()

for ax, feature in zip(axes, qq_features):
    probplot(df[feature], dist="norm", plot=ax)
    # probplot sets its own generic title; replace it with the feature name.
    ax.set_title(feature)

# Hide the grid cells that did not receive a plot instead of showing empty axes.
for ax in axes[len(qq_features):]:
    ax.set_visible(False)

fig.suptitle("Distribution of Numerical Features\n(QQ-Plot)", fontsize=18)

plt.tight_layout()
plt.show()

Most columns seem to be normally distributed, except for windspeed, rv2, rv1 and rh_out.

In [None]:
# Bivariate analysis: scatter each feature against the outcome variable
# ("Appliances"), annotate the correlation and overlay a least-squares line.

scatter_features = list(df.drop(columns=["Appliances", "date"]).columns)
outcome_var = df["Appliances"]

nrows = 5
# Size the grid from the features actually plotted and round UP, so there is
# always one axis per feature regardless of how many columns the frame has.
ncols = int(np.ceil(len(scatter_features) / nrows))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 10))
axes = axes.flatten()

for ax, col in zip(axes, scatter_features):
    label = col.replace("_", " ").title()
    feature = df[col]
    correlation = feature.corr(outcome_var)

    ax.scatter(feature, outcome_var)
    # The outcome plotted here is "Appliances", so name it in the title
    # (the previous "Price" label did not match the data being shown).
    ax.set_title(f"Appliances v/s {label} \n correlation: {correlation: .2f}")

    # Degree-1 polynomial fit, i.e. the straight trend line through the points.
    z = np.polyfit(feature, outcome_var, deg=1)
    y_hat = np.poly1d(z)(feature)
    ax.plot(feature, y_hat, "r--", lw=1)

# Hide the grid cells that did not receive a plot instead of showing empty axes.
for ax in axes[len(scatter_features):]:
    ax.set_visible(False)

fig.suptitle("Relation b/w outcome variable and features", fontsize=18)

plt.tight_layout()
plt.show()

In [None]:
# Correlation chart: absolute pairwise correlations between the features
# (outcome and timestamp excluded), rounded to two decimals.

feature_corr = df.drop(columns=["Appliances", "date"]).corr()

fig = plt.figure(figsize=(20, 20))
sns.heatmap(feature_corr.round(2).abs(), annot=True)
plt.show()

In [None]:
# Separate the outcome column ("Appliances") from the predictors.
y = df["Appliances"]
X = df.drop(columns=["Appliances"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from a DataFrame."""

    def __init__(self, columns_to_drop):
        # Column label(s) handed to DataFrame.drop(columns=...) in transform().
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X without the configured columns; X itself is untouched."""
        return X.copy().drop(columns=self.columns_to_drop)

In [None]:
from sklearn.compose import ColumnTransformer

# Feed every column of X to the ColumnDropper, which strips out "date".
# Columns not selected by any transformer are passed through unchanged.
date_dropper = ColumnDropper(["date"])

preprocessor = ColumnTransformer(
    transformers=[("drop_columns", date_dropper, X.columns)],
    remainder="passthrough",
)

In [None]:
from sklearn.pipeline import Pipeline

# Model pipeline. Every step must be a (name, estimator) pair; the empty `()`
# placeholders that used to be here are not valid steps and make the pipeline
# unusable, so start from the preprocessing step that is already defined.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # TODO: append the remaining steps (e.g. feature scaling and the final
        # regressor) once they have been chosen.
    ]
)