In [1]:
import pandas as pd

# Load and scale data for different bins
def load_and_scale(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    return df

dfb1 = load_and_scale('FNSB_B1.csv', 24293)
dfsal1 = load_and_scale('Salcha_bin1.csv', 275)
dfsal2 = load_and_scale('Salcha_bin2.csv', 183)
dfsal3 = load_and_scale('Salcha_bin3.csv', 18)
dffbk1 = load_and_scale('FBK_bin1.csv', 15840)
dffbk2 = load_and_scale('FBK_bin2.csv', 853)
dffbk3 = load_and_scale('FBK_bin3.csv', 10)
dfnp1 = load_and_scale('NP_bin1.csv', 6329)
dfnp2 = load_and_scale('NP_bin2.csv', 778)
dfnp3 = load_and_scale('NP_bin3.csv', 7)

# Function to merge dataframes and calculate average
# Function to merge dataframes without averaging
def merge_dataframes(dfs):
    df_merged = pd.merge(dfs[0][['timestamp', 'HP_kWh']], dfs[1][['timestamp', 'HP_kWh']], on='timestamp', suffixes=('_1', '_2'))
    for i, df in enumerate(dfs[2:], start=3):  # start=3 because we already have _1 and _2
        df_merged = pd.merge(df_merged, df[['timestamp', 'HP_kWh']], on='timestamp')
        df_merged.rename(columns={'HP_kWh': f'HP_kWh_{i}'}, inplace=True)  # Correctly rename the last merged column
    return df_merged


# Merge and average for each set
dfsal = merge_dataframes([dfsal1, dfsal2, dfsal3])
dffbk = merge_dataframes([dffbk1, dffbk2, dffbk3])
dfnp = merge_dataframes([dfnp1, dfnp2, dfnp3])

# Combining averages
dfc3 = pd.DataFrame()
dfc3['timestamp'] = dfsal['timestamp']
dfc3['Total_HP_kWh'] = dfsal.iloc[:, 1:].sum(axis=1) + dffbk.iloc[:, 1:].sum(axis=1) + dfnp.iloc[:, 1:].sum(axis=1)

# Merging and calculating variances
merged_df = pd.merge(dfb1, dfc3, on='timestamp', suffixes=('_b1', '_c3'))
merged_df['variance'] = (merged_df['HP_kWh'] - merged_df['Total_HP_kWh']).abs() # Adjust column names correctly
merged_df['percent_difference'] = merged_df['variance'] / merged_df[['HP_kWh', 'Total_HP_kWh']].mean(axis=1) * 100

# Sorting and selecting top variances
top_variances = merged_df.sort_values(by='variance', ascending=False).head(5)

# Display the results
print(top_variances[['timestamp', 'HP_kWh', 'Total_HP_kWh', 'variance', 'percent_difference']])


Empty DataFrame
Columns: [timestamp, HP_kWh, Total_HP_kWh, variance, percent_difference]
Index: []


In [4]:
import pandas as pd

# Load and scale data for different bins
def load_and_scale(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    return df

dfb1 = load_and_scale('FNSB_B1.csv', 24293)
dfsal1 = load_and_scale('Salcha_bin1.csv', 275)
dfsal2 = load_and_scale('Salcha_bin2.csv', 183)
dfsal3 = load_and_scale('Salcha_bin3.csv', 18)
dffbk1 = load_and_scale('FBK_bin1.csv', 15840)
dffbk2 = load_and_scale('FBK_bin2.csv', 853)
dffbk3 = load_and_scale('FBK_bin3.csv', 10)
dfnp1 = load_and_scale('NP_bin1.csv', 6329)
dfnp2 = load_and_scale('NP_bin2.csv', 778)
dfnp3 = load_and_scale('NP_bin3.csv', 7)

# Function to merge dataframes
def merge_dataframes(dfs):
    df_merged = pd.merge(dfs[0][['timestamp', 'HP_kWh']], dfs[1][['timestamp', 'HP_kWh']], on='timestamp', suffixes=('_1', '_2'))
    for i, df in enumerate(dfs[2:], start=3):  # start=3 because we already have _1 and _2
        df_merged = pd.merge(df_merged, df[['timestamp', 'HP_kWh']], on='timestamp')
        df_merged.rename(columns={'HP_kWh': f'HP_kWh_{i}'}, inplace=True)  # Correctly rename the last merged column
    return df_merged

# Merge for each set
dfsal = merge_dataframes([dfsal1, dfsal2, dfsal3])
dffbk = merge_dataframes([dffbk1, dffbk2, dffbk3])
dfnp = merge_dataframes([dfnp1, dfnp2, dfnp3])

# Combining averages
dfc3 = pd.DataFrame()
dfc3['timestamp'] = dfsal['timestamp']
dfc3['Total_HP_kWh'] = dfsal.iloc[:, 1:].sum(axis=1) + dffbk.iloc[:, 1:].sum(axis=1) + dfnp.iloc[:, 1:].sum(axis=1)

# Calculate the total HP_kWh for each DataFrame
total_HP_kWh_b1 = dfb1['HP_kWh'].sum()
total_HP_kWh_c3 = dfc3['Total_HP_kWh'].sum()

# Calculate variance and percent difference
variance = abs(total_HP_kWh_b1 - total_HP_kWh_c3)
percent_difference = (variance / ((total_HP_kWh_b1 + total_HP_kWh_c3) / 2)) * 100

# Display the results
print(f"Total HP_kWh in dfb1: {total_HP_kWh_b1}")
print(f"Total HP_kWh in dfc3: {total_HP_kWh_c3}")
print(f"Variance: {variance}")
print(f"Percent Difference: {percent_difference:.2f}%")


Total HP_kWh in dfb1: 6479794627.753269
Total HP_kWh in dfc3: 71463865.70687741
Variance: 6408330762.0463915
Percent Difference: 195.64%


In [8]:
import pandas as pd

# Load data without scaling
def load_data(file_path):
    return pd.read_csv(file_path)

# Load and scale data
def load_and_scale(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    return df

# Loading dataframes without scaling
raw_dfb1 = load_data('FNSB_B1.csv')
raw_dfsal1 = load_data('Salcha_bin1.csv')
raw_dfsal2 = load_data('Salcha_bin2.csv')
raw_dfsal3 = load_data('Salcha_bin3.csv')
raw_dffbk1 = load_data('FBK_bin1.csv')
raw_dffbk2 = load_data('FBK_bin2.csv')
raw_dffbk3 = load_data('FBK_bin3.csv')
raw_dfnp1 = load_data('NP_bin1.csv')
raw_dfnp2 = load_data('NP_bin2.csv')
raw_dfnp3 = load_data('NP_bin3.csv')

# Print some sample rows of raw data
print("Sample raw data from dfb1:")
print(raw_dfb1.head())
print("Sample raw data from dfsal1:")
print(raw_dfsal1.head())
print("Sample raw data from dfsal2:")
print(raw_dfsal2.head())
print("Sample raw data from dfsal3:")
print(raw_dfsal3.head())

# Print sum of raw HP_kWh
print(f"Raw Total HP_kWh in raw_dfb1: {raw_dfb1['HP_kWh'].sum()}")
print(f"Raw Total HP_kWh in raw_dfsal1: {raw_dfsal1['HP_kWh'].sum()}")
print(f"Raw Total HP_kWh in raw_dfsal2: {raw_dfsal2['HP_kWh'].sum()}")
print(f"Raw Total HP_kWh in raw_dfsal3: {raw_dfsal3['HP_kWh'].sum()}")

# Loading dataframes with scaling
dfb1 = load_and_scale('FNSB_B1.csv', 24293)
dfsal1 = load_and_scale('Salcha_bin1.csv', 275)
dfsal2 = load_and_scale('Salcha_bin2.csv', 183)
dfsal3 = load_and_scale('Salcha_bin3.csv', 18)
dffbk1 = load_and_scale('FBK_bin1.csv', 15840)
dffbk2 = load_and_scale('FBK_bin2.csv', 853)
dffbk3 = load_and_scale('FBK_bin3.csv', 10)
dfnp1 = load_and_scale('NP_bin1.csv', 6329)
dfnp2 = load_and_scale('NP_bin2.csv', 778)
dfnp3 = load_and_scale('NP_bin3.csv', 7)

# Print some sample rows of scaled data
print("Sample scaled data from dfb1:")
print(dfb1.head())
print("Sample scaled data from dfsal1:")
print(dfsal1.head())
print("Sample scaled data from dfsal2:")
print(dfsal2.head())
print("Sample scaled data from dfsal3:")
print(dfsal3.head())

# Check sum of scaled HP_kWh
print(f"Scaled Total HP_kWh in dfb1: {dfb1['HP_kWh'].sum()}")
print(f"Scaled Total HP_kWh in dfsal1: {dfsal1['HP_kWh'].sum()}")
print(f"Scaled Total HP_kWh in dfsal2: {dfsal2['HP_kWh'].sum()}")
print(f"Scaled Total HP_kWh in dfsal3: {dfsal3['HP_kWh'].sum()}")

# Function to merge dataframes
def merge_dataframes(dfs):
    df_merged = pd.merge(dfs[0][['timestamp', 'HP_kWh']], dfs[1][['timestamp', 'HP_kWh']], on='timestamp', suffixes=('_1', '_2'))
    for i, df in enumerate(dfs[2:], start=3):  # start=3 because we already have _1 and _2
        df_merged = pd.merge(df_merged, df[['timestamp', 'HP_kWh']], on='timestamp')
        df_merged.rename(columns={'HP_kWh': f'HP_kWh_{i}'}, inplace=True)  # Correctly rename the last merged column
    return df_merged

# Merging dataframes for each set
dfsal = merge_dataframes([dfsal1, dfsal2, dfsal3])
dffbk = merge_dataframes([dffbk1, dffbk2, dffbk3])
dfnp = merge_dataframes([dfnp1, dfnp2, dfnp3])

# Check the combined dataframes
print(f"dfsal shape: {dfsal.shape}")
print(f"dffbk shape: {dffbk.shape}")
print(f"dfnp shape: {dfnp.shape}")
print(dfsal.head())
print(dffbk.head())
print(dfnp.head())

# Combining averages
dfc3 = pd.DataFrame()
dfc3['timestamp'] = dfsal['timestamp']
dfc3['Total_HP_kWh'] = dfsal.iloc[:, 1:].sum(axis=1) + dffbk.iloc[:, 1:].sum(axis=1) + dfnp.iloc[:, 1:].sum(axis=1)

# Check the combined dataframe
print(f"dfc3 shape: {dfc3.shape}")
print(dfc3.head())

# Check sums of individual dataframes before combining
total_HP_kWh_dfsal = dfsal.iloc[:, 1:].sum().sum()
total_HP_kWh_dffbk = dffbk.iloc[:, 1:].sum().sum()
total_HP_kWh_dfnp = dfnp.iloc[:, 1:].sum().sum()

print(f"Total HP_kWh in dfsal: {total_HP_kWh_dfsal}")
print(f"Total HP_kWh in dffbk: {total_HP_kWh_dffbk}")
print(f"Total HP_kWh in dfnp: {total_HP_kWh_dfnp}")

# Calculate the total HP_kWh for each DataFrame
total_HP_kWh_b1 = dfb1['HP_kWh'].sum()
total_HP_kWh_c3 = dfc3['Total_HP_kWh'].sum()

# Calculate variance and percent difference
variance = abs(total_HP_kWh_b1 - total_HP_kWh_c3)
percent_difference = (variance / ((total_HP_kWh_b1 + total_HP_kWh_c3) / 2)) * 100

# Display the results
print(f"Total HP_kWh in dfb1: {total_HP_kWh_b1}")
print(f"Total HP_kWh in dfc3: {total_HP_kWh_c3}")
print(f"Variance: {variance}")
print(f"Percent Difference: {percent_difference:.2f}%")


Sample raw data from dfb1:
    Total_kWh     HP_kWh     Sec_kWh      timestamp
0  427.307267  14.431828  412.875440  1/1/2018 0:30
1  432.002674  14.992676  417.009998  1/1/2018 1:30
2  433.630780  14.431828  419.198952  1/1/2018 2:30
3  434.870151  13.978810  420.891340  1/1/2018 3:30
4  437.241468  13.978810  423.262657  1/1/2018 4:30
Sample raw data from dfsal1:
   Total_kWh  HP_kWh   Sec_kWh      timestamp
0   3.887217     0.0  3.887217  1/1/2018 0:30
1   3.916056     0.0  3.916056  1/1/2018 1:30
2   3.964121     0.0  3.964121  1/1/2018 2:30
3   4.002573     0.0  4.002573  1/1/2018 3:30
4   4.031412     0.0  4.031412  1/1/2018 4:30
Sample raw data from dfsal2:
   Total_kWh  HP_kWh   Sec_kWh      timestamp
0   6.804754     0.0  6.804754  1/1/2018 0:30
1   6.854242     0.0  6.854242  1/1/2018 1:30
2   6.936720     0.0  6.936720  1/1/2018 2:30
3   7.002704     0.0  7.002704  1/1/2018 3:30
4   7.052191     0.0  7.052191  1/1/2018 4:30
Sample raw data from dfsal3:
   Total_kWh  HP_kWh  

In [3]:
import pandas as pd

# Load and scale data for different bins
def load_and_scale(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is in datetime format
    return df

# Load the data
dfb1 = load_and_scale('FNSB_B1.csv', 24293)
dfsal1 = load_and_scale('Salcha_bin1.csv', 275)
dfsal2 = load_and_scale('Salcha_bin2.csv', 183)
dfsal3 = load_and_scale('Salcha_bin3.csv', 18)
dffbk1 = load_and_scale('FBK_bin1.csv', 15840)
dffbk2 = load_and_scale('FBK_bin2.csv', 853)
dffbk3 = load_and_scale('FBK_bin3.csv', 10)
dfnp1 = load_and_scale('NP_bin1.csv', 6329)
dfnp2 = load_and_scale('NP_bin2.csv', 778)
dfnp3 = load_and_scale('NP_bin3.csv', 7)

# Function to merge dataframes without averaging
def merge_dataframes(dfs):
    df_merged = pd.merge(dfs[0][['timestamp', 'HP_kWh']], dfs[1][['timestamp', 'HP_kWh']], on='timestamp', suffixes=('_1', '_2'))
    for i, df in enumerate(dfs[2:], start=3):
        df_merged = pd.merge(df_merged, df[['timestamp', 'HP_kWh']], on='timestamp')
        df_merged.rename(columns={'HP_kWh': f'HP_kWh_{i}'}, inplace=True)
    return df_merged

# Merge and average for each set
dfsal = merge_dataframes([dfsal1, dfsal2, dfsal3])
dffbk = merge_dataframes([dffbk1, dffbk2, dffbk3])
dfnp = merge_dataframes([dfnp1, dfnp2, dfnp3])

# Combining averages
dfc3 = pd.DataFrame()
dfc3['timestamp'] = dfsal['timestamp']
dfc3['Total_HP_kWh'] = dfsal.filter(like='HP_kWh').sum(axis=1) + dffbk.filter(like='HP_kWh').sum(axis=1) + dfnp.filter(like='HP_kWh').sum(axis=1)

# Merging and calculating variances
merged_df = pd.merge(dfb1, dfc3, on='timestamp', suffixes=('_b1', '_c3'))
merged_df['variance'] = (merged_df['HP_kWh'] - merged_df['Total_HP_kWh']).abs()
merged_df['percent_difference'] = merged_df['variance'] / merged_df[['HP_kWh', 'Total_HP_kWh']].mean(axis=1) * 100

# Sorting and selecting top percent differences
top_percent_differences = merged_df.sort_values(by='percent_difference', ascending=False).head(5)

# Display the results
print(top_percent_differences[['timestamp', 'HP_kWh', 'Total_HP_kWh', 'variance', 'percent_difference']])


               timestamp        HP_kWh   Total_HP_kWh      variance  \
191  2018-01-08 23:30:00  1.776661e+06  269439.532046  1.507222e+06   
7751 2018-11-19 23:30:00  1.755723e+06  266703.455851  1.489020e+06   
1709 2018-03-13 05:30:00  1.800526e+06  273602.386172  1.526923e+06   
1711 2018-03-13 07:30:00  1.800526e+06  273602.386172  1.526923e+06   
2140 2018-03-31 04:30:00  2.618136e+06  398036.392951  2.220100e+06   

      percent_difference  
191           147.326242  
7751          147.250809  
1709          147.235204  
1711          147.235204  
2140          147.213053  


In [5]:
import pandas as pd

# Load and scale data for different bins
def load_and_scale(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is in datetime format
    return df

# Load the data
dfb1 = load_and_scale('FNSB_B1.csv', 24293)
dfsal1 = load_and_scale('Salcha_bin1.csv', 275)
dfsal2 = load_and_scale('Salcha_bin2.csv', 183)
dfsal3 = load_and_scale('Salcha_bin3.csv', 18)
dffbk1 = load_and_scale('FBK_bin1.csv', 15840)
dffbk2 = load_and_scale('FBK_bin2.csv', 853)
dffbk3 = load_and_scale('FBK_bin3.csv', 10)
dfnp1 = load_and_scale('NP_bin1.csv', 6329)
dfnp2 = load_and_scale('NP_bin2.csv', 778)
dfnp3 = load_and_scale('NP_bin3.csv', 7)

# Function to merge dataframes
def merge_dataframes(dfs):
    df_merged = pd.merge(dfs[0][['timestamp', 'HP_kWh']], dfs[1][['timestamp', 'HP_kWh']], on='timestamp', suffixes=('_1', '_2'))
    for i, df in enumerate(dfs[2:], start=3):  # start=3 because we already have _1 and _2
        df_merged = pd.merge(df_merged, df[['timestamp', 'HP_kWh']], on='timestamp')
        df_merged.rename(columns={'HP_kWh': f'HP_kWh_{i}'}, inplace=True)  # Correctly rename the last merged column
    return df_merged

# Merge for each set
dfsal = merge_dataframes([dfsal1, dfsal2, dfsal3])
dffbk = merge_dataframes([dffbk1, dffbk2, dffbk3])
dfnp = merge_dataframes([dfnp1, dfnp2, dfnp3])

# Combining averages
dfc3 = pd.DataFrame()
dfc3['timestamp'] = dfsal['timestamp']
dfc3['Total_HP_kWh'] = dfsal.filter(like='HP_kWh').sum(axis=1) + dffbk.filter(like='HP_kWh').sum(axis=1) + dfnp.filter(like='HP_kWh').sum(axis=1)

# Merging and calculating variances
merged_df = pd.merge(dfb1, dfc3, on='timestamp', suffixes=('_b1', '_c3'))
merged_df['variance'] = (merged_df['HP_kWh'] - merged_df['Total_HP_kWh']).abs()
merged_df['percent_difference'] = merged_df['variance'] / merged_df[['HP_kWh', 'Total_HP_kWh']].mean(axis=1) * 100



# Calculate the total HP_kWh for each DataFrame
total_HP_kWh_b1 = dfb1['HP_kWh'].sum()
total_HP_kWh_c3 = dfc3['Total_HP_kWh'].sum()

# Calculate overall variance and percent difference
overall_variance = abs(total_HP_kWh_b1 - total_HP_kWh_c3)
overall_percent_difference = (overall_variance / ((total_HP_kWh_b1 + total_HP_kWh_c3) / 2)) * 100

# Display the overall results
print(f"Total HP_kWh in dfb1: {total_HP_kWh_b1}")
print(f"Total HP_kWh in dfc3: {total_HP_kWh_c3}")
print(f"Overall Variance: {overall_variance}")
print(f"Overall Percent Difference: {overall_percent_difference:.2f}%")


Total HP_kWh in dfb1: 6479794627.753269
Total HP_kWh in dfc3: 1945444370.6714609
Overall Variance: 4534350257.081808
Overall Percent Difference: 107.64%
