Consolidates all training data from the previous year and preprocesses it.

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used by the other cells
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd

In [12]:
# Read the raw export from the mounted Drive folder.
combined_df = pd.read_csv('/content/drive/MyDrive/MLData0.csv')

# Show a heading followed by every column header, one per line.
header_names = [str(column_label) for column_label in combined_df.columns]
print("\n".join(["Names of all the headers:", *header_names]))

Names of all the headers:
timestamp
Total
Washer
BlowerGH
Lights
BlowerBed
CompGH
CompBed
Dryer
Recs1
Recs2
WaterHeater


In [13]:
# Define the new column names.
# NOTE(review): the headers printed when this file was loaded contain `Total`,
# not ` "Total"`, so this mapping matches nothing for that file and the column
# keeps the name `Total` (which the rest of this cell and the following cells
# rely on). Confirm which exports actually carry the quoted header.
new_column_names = {
    ' "Total"': 'Main',
}

# Rename columns (labels that are absent from the frame are ignored)
combined_df = combined_df.rename(columns=new_column_names)

# Convert every column after the first one (the timestamp) to numeric and
# coerce unparseable entries to NaN. The assignment is done by column label
# instead of through `.iloc[:, 1:] = ...`: label assignment replaces the
# columns, so the numeric dtype produced by `pd.to_numeric` is kept, whereas
# positional assignment writes into the existing columns and can leave them
# with their original (object) dtype.
numeric_columns = combined_df.columns[1:]
combined_df[numeric_columns] = combined_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Add a new column "Recs" containing the sum of values from "Recs1" and "Recs2"
combined_df['Recs'] = combined_df['Recs1'] + combined_df['Recs2']
combined_df = combined_df.drop(columns=['Recs1', 'Recs2'])

# Add a new column "HVAC" as the sum of both blowers and both compressors,
# then drop the four source columns
hvac_sources = ['BlowerGH', 'BlowerBed', 'CompGH', 'CompBed']
combined_df['HVAC'] = (
    combined_df['BlowerGH']
    + combined_df['BlowerBed']
    + combined_df['CompGH']
    + combined_df['CompBed']
)
combined_df = combined_df.drop(columns=hvac_sources)

columns_to_abs = ['Total', 'Washer', 'Lights', 'Dryer', 'WaterHeater', 'Recs', 'HVAC']
# Take the absolute value of selected columns
combined_df[columns_to_abs] = combined_df[columns_to_abs].abs()

print(combined_df)

            timestamp    Total  Washer  Lights   Dryer  WaterHeater  Recs  \
0     1/18/2023 22:00    164.1     0.5    25.7     0.0          0.0  12.7   
1     1/18/2023 22:01    164.3     0.5    25.6     0.0          0.0  12.7   
2     1/18/2023 22:02    164.1     0.6    25.8     0.0          0.0  12.5   
3     1/18/2023 22:03    164.0     0.5    25.6     0.0          0.0  12.7   
4     1/18/2023 22:04    164.2     0.6    25.7     0.0          0.0  12.5   
...               ...      ...     ...     ...     ...          ...   ...   
5771  1/22/2023 22:11   7091.1     6.4    41.7   970.1          0.0  17.4   
5772  1/22/2023 22:12   1931.8     6.0    43.4     0.1          0.0  13.1   
5773  1/22/2023 22:13   1943.2     6.0    43.5     0.1          0.0  13.1   
5774  1/22/2023 22:14   7393.9     6.4    41.9  1026.7          0.0  17.8   
5775  1/22/2023 22:15  10401.1     6.7    41.0     0.0          0.0  18.6   

        HVAC  
0       33.8  
1       33.8  
2       33.9  
3       33.7  


In [14]:
# Build `df` as a new frame instead of aliasing `combined_df`: the original
# `df = combined_df` followed by in-place operations also rewrote
# `combined_df`, so re-inspecting that frame no longer showed what the
# previous cell produced.
df = (
    combined_df
    # Parse the timestamp strings into datetime values
    .assign(timestamp=pd.to_datetime(combined_df['timestamp']))
    # Chronological order; a stable sort keeps rows that share a timestamp in
    # their original relative order, so the result is deterministic
    .sort_values(by='timestamp', kind='stable')
    # Renumber the rows 0..n-1 in the sorted order (no rows are removed here)
    .reset_index(drop=True)
)

# Every column except the timestamp is processed below
columns_to_process = df.columns[df.columns != 'timestamp']

# Replace negative values with their absolute values, then replace NaN with 0
df[columns_to_process] = df[columns_to_process].abs().fillna(0)

df

Unnamed: 0,timestamp,Total,Washer,Lights,Dryer,WaterHeater,Recs,HVAC
0,2023-01-18 22:00:00,164.1,0.5,25.7,0.0,0.0,12.7,33.8
1,2023-01-18 22:01:00,164.3,0.5,25.6,0.0,0.0,12.7,33.8
2,2023-01-18 22:02:00,164.1,0.6,25.8,0.0,0.0,12.5,33.9
3,2023-01-18 22:03:00,164.0,0.5,25.6,0.0,0.0,12.7,33.7
4,2023-01-18 22:04:00,164.2,0.6,25.7,0.0,0.0,12.5,33.8
...,...,...,...,...,...,...,...,...
5771,2023-01-22 22:11:00,7091.1,6.4,41.7,970.1,0.0,17.4,889.0
5772,2023-01-22 22:12:00,1931.8,6.0,43.4,0.1,0.0,13.1,882.3
5773,2023-01-22 22:13:00,1943.2,6.0,43.5,0.1,0.0,13.1,886.5
5774,2023-01-22 22:14:00,7393.9,6.4,41.9,1026.7,0.0,17.8,888.7


In [16]:
cols_to_check = ['Washer', 'HVAC', 'Lights', 'Dryer', 'Recs', 'WaterHeater']

# Keep a row only when its Total is non-zero ...
total_is_nonzero = df['Total'] != 0
# ... and at least one of the checked columns is non-zero (i.e. drop rows
# where all of them are 0)
any_checked_nonzero = (df[cols_to_check] != 0).any(axis=1)

# Filter and renumber in one expression. `reset_index` returns a new frame
# here rather than modifying the filtered slice in place, so `df` is an
# independent frame that later cells can add columns to without pandas'
# chained-assignment warning.
df = df[total_is_nonzero & any_checked_nonzero].reset_index(drop=True)

In [18]:
# Select columns containing power usage from each appliance
appliances_columns = ['Washer', 'HVAC', 'Lights', 'Dryer', 'Recs', 'WaterHeater']

# Largest "Other" value that is kept; rows above it are discarded
OTHER_UPPER_LIMIT = 2500

# Create a new column "Other" by subtracting the sum of the individual
# appliances from "Total". `assign` returns a new frame, so the column is
# never written into a filtered slice of an earlier frame.
df = df.assign(Other=df['Total'] - df[appliances_columns].sum(axis=1))

# Keep only rows with 0 <= Other <= OTHER_UPPER_LIMIT (both bounds inclusive),
# then renumber the remaining rows
df = df[df['Other'].between(0, OTHER_UPPER_LIMIT)].reset_index(drop=True)

df

Unnamed: 0,timestamp,Total,Washer,Lights,Dryer,WaterHeater,Recs,HVAC,Other
0,2023-01-18 22:00:00,164.1,0.5,25.7,0.0,0.0,12.7,33.8,91.4
1,2023-01-18 22:01:00,164.3,0.5,25.6,0.0,0.0,12.7,33.8,91.7
2,2023-01-18 22:02:00,164.1,0.6,25.8,0.0,0.0,12.5,33.9,91.3
3,2023-01-18 22:03:00,164.0,0.5,25.6,0.0,0.0,12.7,33.7,91.5
4,2023-01-18 22:04:00,164.2,0.6,25.7,0.0,0.0,12.5,33.8,91.6
...,...,...,...,...,...,...,...,...,...
5452,2023-01-22 22:07:00,1927.9,6.0,43.5,0.1,0.0,13.0,879.7,985.6
5453,2023-01-22 22:09:00,1927.3,6.0,43.3,0.1,0.0,13.0,880.9,984.0
5454,2023-01-22 22:10:00,1926.0,6.0,43.2,0.1,0.0,13.0,880.1,983.6
5455,2023-01-22 22:12:00,1931.8,6.0,43.4,0.1,0.0,13.1,882.3,986.9


In [19]:
# Write the fully cleaned frame to Drive, without the row index column.
final_training_csv = '/content/drive/MyDrive/final_training_data.csv'
df.to_csv(final_training_csv, index=False)

In [None]:
# Target sizes of the two exports.
TRAIN_ROWS = 46000
TEST_ROWS = 10000

# `head(46000)` and `tail(10000)` overlap whenever the frame has fewer than
# TRAIN_ROWS + TEST_ROWS rows, which puts the same rows in both files. Compute
# disjoint slice sizes instead: with at least TRAIN_ROWS + TEST_ROWS rows the
# result is exactly the first 46000 and the last 10000 rows as before; with
# fewer rows the same train:test ratio is kept and the slices never overlap.
n_rows = len(df)
n_test = min(TEST_ROWS, n_rows * TEST_ROWS // (TRAIN_ROWS + TEST_ROWS))
n_train = min(TRAIN_ROWS, n_rows - n_test)

# Training data: the first n_train rows.
df.iloc[:n_train].to_csv('/content/drive/MyDrive/combined_training_data2.csv', index=False)
# Open test data: the last n_test rows (explicit start index so that
# n_test == 0 yields an empty slice).
df.iloc[n_rows - n_test:].to_csv('/content/drive/MyDrive/open_test2.csv', index=False)

In [None]:
# The aggregate column is called `Total` in the frame built by the cells
# above, so selecting `Main` directly raises KeyError. Accept either name and
# always expose it as `Main` in the blind-test frame.
main_source = 'Main' if 'Main' in df.columns else 'Total'
# `rename` returns a new frame, so `df` itself is left untouched.
new_df = df[['timestamp', main_source]].rename(columns={main_source: 'Main'})  # /content/drive/MyDrive/

# Re-label 2023 timestamps as 2024: go through the string form, swap the year
# prefix as a literal (regex=False keeps this independent of the pandas
# version's default), then parse back to datetime.
new_df['timestamp'] = new_df['timestamp'].astype(str)
new_df['timestamp'] = new_df['timestamp'].str.replace('2023-', '2024-', regex=False)
new_df['timestamp'] = pd.to_datetime(new_df['timestamp'])  # Convert to datetime format

In [None]:
# Display the blind-test frame (timestamp and Main columns) for a visual check.
new_df

Unnamed: 0,timestamp,Main
0,2024-01-01 22:00:00,346.7
1,2024-01-01 22:04:00,810.8
2,2024-01-01 22:06:00,468.1
3,2024-01-01 22:08:00,521.2
4,2024-01-01 22:10:00,484.1
...,...,...
55245,2024-04-21 23:56:00,184.8
55246,2024-04-21 23:57:00,185.1
55247,2024-04-21 23:58:00,185.3
55248,2024-04-21 23:59:00,185.3


In [None]:
# Blind-test exports: destination path -> number of leading rows to keep
# (None means the whole frame). Files are written in the order listed.
blind_test_exports = {
    '/content/drive/MyDrive/blind_test_large.csv': None,
    '/content/drive/MyDrive/blind_test_smaller.csv': 1000,
    '/content/drive/MyDrive/blind_test_very_small.csv': 100,
}

for csv_path, row_cap in blind_test_exports.items():
    if row_cap is None:
        subset = new_df
    else:
        subset = new_df.head(row_cap)
    subset.to_csv(csv_path, index=False)