# Household Power Consumption - Clustering

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw semicolon-separated readings. low_memory=False makes pandas
# process the file in one pass rather than in chunks.
power = pd.read_csv(
    '/kaggle/input/electric-power-consumption-data-set/household_power_consumption.txt',
    sep=';',
    low_memory=False,
)
power.head()

In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print('Shape of Power Consumption Data:',power.shape)

In [None]:
# Print a concise summary of the frame (column names and dtypes).
power.info()

## Exploratory Data Analysis

### Null Values

In [None]:
# Count the missing entries in each column. Only values pandas already
# recognises as missing are counted; literal '?' strings are not.
null_values = power.isnull().sum()

In [None]:
# Bar chart of the per-column missing-value counts.
null_values.plot.bar(cmap='viridis')
plt.show()

### Replacing ? with null

In [None]:
# Turn every '?' placeholder into a real missing value so that
# isna()/dropna() can detect it.
power = power.replace(to_replace='?', value=pd.NA)

In [None]:
# Recount the missing entries per column on the current state of the frame.
null_values = power.isnull().sum()

In [None]:
# Bar chart of the refreshed per-column missing-value counts.
null_values.plot.bar(cmap='viridis')
plt.show()

In [None]:
# Discard every row that has at least one missing value.
power = power.dropna()

In [None]:
# Report the (rows, columns) dimensions that remain after the null rows were dropped.
print('Shape of Power Consumption Data after removing null values:',power.shape)

In [None]:
# Preview the first rows of the cleaned frame.
power.head()

### Changing dtypes of columns to float datatype

In [None]:
# Cast every measurement column to float so it can be aggregated and scaled.
measurement_columns = (
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
)
for name in measurement_columns:
    power[name] = power[name].astype('float')

In [None]:
# Re-check the column dtypes after the float conversion.
power.info()

In [None]:
# Summary statistics for the numeric columns.
power.describe()

### Boxplot of Columns

In [None]:
# Numeric measurement columns to inspect for spread and outliers.
cols = ['Global_active_power','Global_reactive_power','Voltage','Global_intensity','Sub_metering_1','Sub_metering_2','Sub_metering_3']

# Draw one horizontal box plot per column.
# The Series is passed as `x=` instead of positionally: in current seaborn the
# first positional parameter is `data`, which would draw the box along the
# y-axis and leave the x-axis label below attached to the wrong axis.
for column in cols:
    sns.boxplot(x=power[column])
    plt.title(f'Box plot of {column}')
    plt.xlabel(f'{column}')
    plt.show()

### Preparing Summary DataFrame grouped by Date

In [None]:
# Group once by calendar date and derive one Series per measure:
# voltage is averaged, every other measure is summed.
readings_by_date = power.groupby('Date')

active_power = readings_by_date['Global_active_power'].sum()
voltage_avg = readings_by_date['Voltage'].mean()
intensity = readings_by_date['Global_intensity'].sum()
submeter_1 = readings_by_date['Sub_metering_1'].sum()
submeter_2 = readings_by_date['Sub_metering_2'].sum()
submeter_3 = readings_by_date['Sub_metering_3'].sum()

In [None]:
# Start the daily summary by joining active power with average voltage on Date.
total_energy = active_power.to_frame().merge(voltage_avg, how='inner', on='Date')

In [None]:
# Append the remaining daily measures one at a time, always joining on Date.
join_on_date = {'on': 'Date', 'how': 'inner'}

total_energy_2 = total_energy.merge(intensity, **join_on_date)
total_energy_3 = total_energy_2.merge(submeter_1, **join_on_date)
total_energy_4 = total_energy_3.merge(submeter_2, **join_on_date)
energy = total_energy_4.merge(submeter_3, **join_on_date)

In [None]:
# Preview the first rows of the per-date summary.
energy.head()

## Preprocessing 

In [None]:
# Model on an independent copy, so that columns added to `energy` later
# (such as the cluster labels) never become model inputs.
df_electric = energy.copy(deep=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Standardise every column of the daily summary before dimensionality reduction.
attributes = df_electric.columns.tolist()
num_pipeline = make_pipeline(StandardScaler())
preprocessing = ColumnTransformer(
    transformers=[('num', num_pipeline, attributes)],
)

## PCA and KMeans

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Scale the features, then apply PCA with a fractional n_components: the
# number of components kept is chosen from the explained-variance ratio (0.95).
pca = make_pipeline(
    preprocessing,
    PCA(n_components=0.95),
)

In [None]:
# Fit one scale -> PCA -> KMeans pipeline per candidate cluster count and
# record the fitted KMeans inertia for the elbow plot.
n_clusters = list(range(2, 9))
inertias = []

for cluster in n_clusters:
    kmeans_electric = make_pipeline(
        pca,
        KMeans(n_clusters=cluster, n_init='auto', random_state=42),
    )
    kmeans_electric.fit(df_electric)
    inertias.append(kmeans_electric.named_steps['kmeans'].inertia_)


In [None]:
# Plot inertia against the number of clusters to locate the elbow.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(n_clusters, inertias)
elbow_ax.set_title('Knee-Elbow Plot')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.grid()
plt.show()

Based on the Knee-Elbow plot, the number of clusters chosen is 3.

In [None]:
# Final model: the same scale -> PCA front end followed by a 3-cluster KMeans.
final_kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
kmeans_e3 = make_pipeline(pca, final_kmeans)
kmeans_e3.fit(df_electric)

In [None]:
# Attach each day's fitted cluster label to the daily summary.
energy['cluster'] = kmeans_e3.named_steps['kmeans'].labels_

## Cluster Analysis

### Global Active Power

In [None]:
# Distribution of Global_active_power within each cluster.
sns.boxplot(
    x='cluster',
    y='Global_active_power',
    data=energy,
)
plt.show()

### Voltage

In [None]:
# Distribution of Voltage within each cluster.
sns.boxplot(
    x='cluster',
    y='Voltage',
    data=energy,
)
plt.show()

### Global Intensity

In [None]:
# Distribution of Global_intensity within each cluster.
sns.boxplot(
    x='cluster',
    y='Global_intensity',
    data=energy,
)
plt.show()

### Sub-metering-1

In [None]:
# Distribution of Sub_metering_1 within each cluster.
sns.boxplot(
    x='cluster',
    y='Sub_metering_1',
    data=energy,
)
plt.show()

### Sub-metering-2

In [None]:
# Distribution of Sub_metering_2 within each cluster.
sns.boxplot(
    x='cluster',
    y='Sub_metering_2',
    data=energy,
)
plt.show()

### Sub-metering-3

In [None]:
# Distribution of Sub_metering_3 within each cluster.
sns.boxplot(
    x='cluster',
    y='Sub_metering_3',
    data=energy,
)
plt.show()

In [None]:
# Split the daily summary into one frame per cluster label.
cluster_0 = energy.loc[energy['cluster'] == 0]
cluster_1 = energy.loc[energy['cluster'] == 1]
cluster_2 = energy.loc[energy['cluster'] == 2]

In [None]:
# Side-by-side box plots of the three sub-metering totals for cluster 0.
fig, axes = plt.subplots(1, 3, figsize=(10, 8))
cluster_0_panels = [
    ('Sub_metering_1', 'Box Plot of Sub-metering-1 Cluster 0'),
    ('Sub_metering_2', 'Box Plot of Sub-metering-2 Cluster 0'),
    ('Sub_metering_3', 'Box Plot of Sub-metering_3 Cluster 0'),
]
for panel_ax, (meter, panel_title) in zip(axes, cluster_0_panels):
    sns.boxplot(data=cluster_0, x=meter, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of the three sub-metering totals for cluster 1.
fig, axes = plt.subplots(1, 3, figsize=(10, 8))
cluster_1_panels = [
    ('Sub_metering_1', 'Box Plot of Submetering_1 Cluster 1'),
    ('Sub_metering_2', 'Box Plot of Sub_metering_2 Cluster 1'),
    ('Sub_metering_3', 'Box Plot of Sub_metering_3 Cluster 1'),
]
for panel_ax, (meter, panel_title) in zip(axes, cluster_1_panels):
    sns.boxplot(data=cluster_1, x=meter, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of the three sub-metering totals for cluster 2.
fig, axes = plt.subplots(1, 3, figsize=(10, 8))
cluster_2_panels = [
    ('Sub_metering_1', 'Box plot of Sub_metering_1 Cluster 2'),
    ('Sub_metering_2', 'Box plot of Sub_metering_2 Cluster 2'),
    ('Sub_metering_3', 'Box plot of Sub_metering_3 Cluster 2'),
]
for panel_ax, (meter, panel_title) in zip(axes, cluster_2_panels):
    sns.boxplot(data=cluster_2, x=meter, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Run the daily summary through the fitted pipeline's transform().
# NOTE(review): the last step of kmeans_e3 is KMeans, whose transform() returns
# each row's distance to every cluster centre, so this should be an
# (n_days, 3) cluster-distance matrix rather than the PCA components. If PCA
# coordinates were intended, kmeans_e3[:-1].transform(df_electric) would give
# them -- confirm which one is wanted.
energy_transformed = kmeans_e3.transform(df_electric)

In [None]:
# Check the dimensions of the transformed array before naming its columns.
energy_transformed.shape

In [None]:
# Wrap the transformed array in a DataFrame with three named columns.
transformed_names = [f'transformed_{i}' for i in (1, 2, 3)]
df_transformed = pd.DataFrame(energy_transformed, columns=transformed_names)

In [None]:
# Preview the first rows of the transformed frame.
df_transformed.head()

In [None]:
# Attach the fitted cluster labels so the scatter plots can colour by cluster.
df_transformed['cluster'] = kmeans_e3.named_steps['kmeans'].labels_

## Transformed Data Analysis

In [None]:
# transformed_1 against transformed_2, coloured by cluster label.
sns.scatterplot(
    x='transformed_1',
    y='transformed_2',
    hue='cluster',
    data=df_transformed,
)
plt.show()

In [None]:
# transformed_1 against transformed_3, coloured by cluster label.
sns.scatterplot(
    x='transformed_1',
    y='transformed_3',
    hue='cluster',
    data=df_transformed,
)
plt.show()

In [None]:
# transformed_2 against transformed_3, coloured by cluster label.
sns.scatterplot(
    x='transformed_2',
    y='transformed_3',
    hue='cluster',
    data=df_transformed,
)
plt.show()

## Conclusion

### Cluster 0

It represents days with high active power, medium voltage and high global intensity. The amount of energy consumed by the air conditioner and water heater was high. The energy consumed by the kitchen and laundry was higher than in Clusters 1 and 2.

### Cluster 1

It represents days with low active power, medium voltage and low global intensity. The amounts of energy consumed by the kitchen, laundry, air conditioner and water heater were all low.

### Cluster 2

It represents days with medium active power, high voltage and medium global intensity. The amount of energy consumed by the air conditioner and water heater was high. Kitchen and laundry usage was low.