In [3]:
import pandas as pd

# Read the B1 file and the three bin files.
df1 = pd.read_csv('FNSB_B1.csv')
dfbin1 = pd.read_csv('FNSB_Bin1.csv')
dfbin2 = pd.read_csv('FNSB_Bin2.csv')
dfbin3 = pd.read_csv('FNSB_Bin3.csv')

# Scale every HP_kWh column by its file-specific factor. The three bin
# factors add up to the B1 factor (23016 + 1263 + 14 = 24293).
# NOTE(review): the factors look like unit counts per bin -- confirm.
df1['HP_kWh'] = df1['HP_kWh'] * 24293
dfbin1['HP_kWh'] = dfbin1['HP_kWh'] * 23016
dfbin2['HP_kWh'] = dfbin2['HP_kWh'] * 1263
dfbin3['HP_kWh'] = dfbin3['HP_kWh'] * 14

# Combined bin profile, keyed by the timestamps of bin 1.
# NOTE(review): the sum is aligned on the default row index that read_csv
# produces, i.e. by row position and not by timestamp -- this assumes the
# three bin files list the same timestamps in the same order.
df2 = dfbin1[['timestamp']].copy()
df2['HP_kWh'] = dfbin1['HP_kWh'].add(dfbin2['HP_kWh']).add(dfbin3['HP_kWh'])

# Join B1 against the combined bins; the shared HP_kWh column is
# disambiguated as HP_kWh_b1 / HP_kWh_b3.
merged_df = df1.merge(df2, on='timestamp', suffixes=('_b1', '_b3'))

# 'variance' is the absolute per-row difference (not a statistical variance);
# 'percent_difference' expresses it relative to the mean of the two values.
merged_df['variance'] = merged_df['HP_kWh_b1'].sub(merged_df['HP_kWh_b3']).abs()
merged_df['percent_difference'] = (
    merged_df['variance']
    .div(merged_df[['HP_kWh_b1', 'HP_kWh_b3']].mean(axis=1))
    .mul(100)
)

# Keep the five rows with the largest absolute difference.
top_variances = merged_df.sort_values('variance', ascending=False).iloc[:5]

# Show the result.
print(top_variances.loc[:, ['timestamp', 'HP_kWh_b1', 'HP_kWh_b3', 'variance', 'percent_difference']])


           timestamp     HP_kWh_b1     HP_kWh_b3      variance  \
551  1/23/2018 23:30  1.234454e+06  1.176422e+06  58031.175978   
550  1/23/2018 22:30  1.161461e+06  1.106939e+06  54522.263748   
479  1/20/2018 23:30  1.129045e+06  1.076671e+06  52373.865152   
383  1/16/2018 23:30  1.041323e+06  9.905992e+05  50724.103626   
476  1/20/2018 20:30  1.113682e+06  1.063006e+06  50675.941458   

     percent_difference  
551            4.814115  
550            4.807113  
479            4.748920  
383            4.992720  
476            4.656243  


In [4]:
import pandas as pd

# Read the B1 file and the three bin files.
df1 = pd.read_csv('FNSB_B1.csv')
dfbin1 = pd.read_csv('FNSB_Bin1.csv')
dfbin2 = pd.read_csv('FNSB_Bin2.csv')
dfbin3 = pd.read_csv('FNSB_Bin3.csv')

# Apply the per-file scale factors to HP_kWh. The three bin factors add up
# to the B1 factor (23016 + 1263 + 14 = 24293).
df1['HP_kWh'] = df1['HP_kWh'] * 24293
dfbin1['HP_kWh'] = dfbin1['HP_kWh'] * 23016
dfbin2['HP_kWh'] = dfbin2['HP_kWh'] * 1263
dfbin3['HP_kWh'] = dfbin3['HP_kWh'] * 14

# Combined bin profile, keyed by the timestamps of bin 1.
# NOTE(review): the addition pairs rows by position (default row index), so
# it assumes the three bin files share the same timestamp ordering.
df2 = dfbin1[['timestamp']].copy()
df2['HP_kWh'] = dfbin1['HP_kWh'].add(dfbin2['HP_kWh']).add(dfbin3['HP_kWh'])

# Join B1 against the combined bins on timestamp.
merged_df = df1.merge(df2, on='timestamp', suffixes=('_b1', '_b3'))

# Totals over all joined rows.
total_HP_kWh_b1 = merged_df['HP_kWh_b1'].sum()
total_HP_kWh_b3 = merged_df['HP_kWh_b3'].sum()

# Absolute difference of the totals, and that difference relative to the
# mean of the two totals, in percent.
variance = abs(total_HP_kWh_b1 - total_HP_kWh_b3)
percent_difference = variance / ((total_HP_kWh_b1 + total_HP_kWh_b3) / 2) * 100

# Report the four figures, one per line.
print(
    f"Total HP_kWh_b1: {total_HP_kWh_b1}",
    f"Total HP_kWh_b3: {total_HP_kWh_b3}",
    f"Variance: {variance}",
    f"Percent Difference: {percent_difference:.2f}%",
    sep="\n",
)


Total HP_kWh_b1: 2159931542.598756
Total HP_kWh_b3: 2132685746.4028134
Variance: 27245796.195942402
Percent Difference: 1.27%


In [1]:
import pandas as pd

# Merge two HP_kWh profiles on timestamp and print the largest percent differences
def analyze_differences(df_base, df_compare, base_label, compare_label):
    """Print the five timestamps where two ``HP_kWh`` series differ most, in percent.

    The two frames are joined on ``timestamp`` with ``pd.merge`` (default
    join type). For every joined row, ``variance`` is the absolute difference
    of the two ``HP_kWh`` values (it is not a statistical variance) and
    ``percent_difference`` is that difference divided by the mean of the two
    values, multiplied by 100.

    The input frames are left untouched: the columns are renamed on copies,
    so callers can keep using ``HP_kWh`` on their own frames after the call.

    Args:
        df_base: DataFrame with ``timestamp`` and ``HP_kWh`` columns.
        df_compare: DataFrame with ``timestamp`` and ``HP_kWh`` columns.
        base_label: Name shown for ``df_base`` in the printed heading.
        compare_label: Name shown for ``df_compare`` in the printed heading.

    Returns:
        None. The heading and the top-5 table are printed.
    """
    # rename() without inplace returns a new frame, so the caller's frames
    # keep their original column names.
    base = df_base.rename(columns={'HP_kWh': 'HP_kWh_base'})
    compare = df_compare.rename(columns={'HP_kWh': 'HP_kWh_compare'})

    # Merge on timestamp
    merged_df = pd.merge(base, compare, on='timestamp')

    # Absolute difference and its size relative to the row mean, in percent
    merged_df['variance'] = (merged_df['HP_kWh_base'] - merged_df['HP_kWh_compare']).abs()
    merged_df['percent_difference'] = (merged_df['variance'] / merged_df[['HP_kWh_base', 'HP_kWh_compare']].mean(axis=1)) * 100

    # Get top 5 percent differences
    top_percent_differences = merged_df.sort_values(by='percent_difference', ascending=False).head(5)
    print(f"Top 5 percent differences between {base_label} and {compare_label}:")
    print(top_percent_differences[['timestamp', 'HP_kWh_base', 'HP_kWh_compare', 'variance', 'percent_difference']])
    print()  # Print a newline for better separation

# Load and preprocess data
def load_and_preprocess(file_path, scale_factor):
    """Read a CSV, scale its ``HP_kWh`` column and parse its ``timestamp`` column.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.
        scale_factor: Multiplier applied to every ``HP_kWh`` value.

    Returns:
        DataFrame with the file's columns, where ``HP_kWh`` has been
        multiplied by ``scale_factor`` and ``timestamp`` has been converted
        with ``pd.to_datetime``.
    """
    raw = pd.read_csv(file_path)
    return raw.assign(
        HP_kWh=raw['HP_kWh'] * scale_factor,
        timestamp=pd.to_datetime(raw['timestamp']),
    )

# Load the B1 profile and the three C1 location profiles, scaling each on
# load. The three C1 factors add up to the B1 factor
# (476 + 16703 + 7114 = 24293).
df_base = load_and_preprocess('FNSB_B1.csv', 24293)
df_salcha = load_and_preprocess('Salcha_C1.csv', 476)
df_fbx = load_and_preprocess('Fairbanks_C1.csv', 16703)
df_np = load_and_preprocess('North Pole_C1.csv', 7114)

# Combined C1 profile, keyed by the Salcha timestamps.
# The addition pairs rows by position (default row index), so it assumes the
# timestamps are aligned and present in all three location files.
df_C1 = df_salcha[['timestamp']].copy()
df_C1['HP_kWh'] = df_salcha['HP_kWh'].add(df_fbx['HP_kWh']).add(df_np['HP_kWh'])

# Compare B1 against the combined C1 profile
analyze_differences(df_base, df_C1, 'FNSB_B1', 'Combined_C1')


Top 5 percent differences between FNSB_B1 and Combined_C1:
               timestamp   HP_kWh_base  HP_kWh_compare      variance  \
1438 2018-03-01 22:30:00  2.754826e+06   427412.270882  2.327414e+06   
1439 2018-03-01 23:30:00  2.754826e+06   428840.000279  2.325986e+06   
1437 2018-03-01 21:30:00  2.718801e+06   426108.280674  2.292693e+06   
1709 2018-03-13 05:30:00  1.800526e+06   283717.981386  1.516808e+06   
1711 2018-03-13 07:30:00  1.800526e+06   283717.981386  1.516808e+06   

      percent_difference  
1438          146.275268  
1439          146.119980  
1437          145.803429  
1709          145.549945  
1711          145.549945  



In [2]:
import pandas as pd

# Load and preprocess the data
def load_and_preprocess(file_path, scale_factor):
    """Read a CSV, scale its ``HP_kWh`` column and parse its ``timestamp`` column.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.
        scale_factor: Multiplier applied to every ``HP_kWh`` value.

    Returns:
        DataFrame with the file's columns, where ``HP_kWh`` has been
        multiplied by ``scale_factor`` and ``timestamp`` has been converted
        with ``pd.to_datetime``.
    """
    raw = pd.read_csv(file_path)
    return raw.assign(
        HP_kWh=raw['HP_kWh'] * scale_factor,
        timestamp=pd.to_datetime(raw['timestamp']),
    )

# Load the B1 file and the three bin files, scaling each on load. The three
# bin factors add up to the B1 factor (23016 + 1263 + 14 = 24293).
df1 = load_and_preprocess('FNSB_B1.csv', 24293)
dfbin1 = load_and_preprocess('FNSB_Bin1.csv', 23016)
dfbin2 = load_and_preprocess('FNSB_Bin2.csv', 1263)
dfbin3 = load_and_preprocess('FNSB_Bin3.csv', 14)

# Combined bin profile, keyed by the timestamps of bin 1.
# NOTE(review): the addition pairs rows by position (default row index), so
# it assumes the three bin files share the same timestamp ordering.
df2 = dfbin1[['timestamp']].copy()
df2['HP_kWh'] = dfbin1['HP_kWh'].add(dfbin2['HP_kWh']).add(dfbin3['HP_kWh'])

# Join B1 against the combined bins; the shared HP_kWh column is
# disambiguated as HP_kWh_b1 / HP_kWh_b3.
merged_df = df1.merge(df2, on='timestamp', suffixes=('_b1', '_b3'))

# 'variance' is the absolute per-row difference (not a statistical variance);
# 'percent_difference' expresses it relative to the mean of the two values.
merged_df['variance'] = merged_df['HP_kWh_b1'].sub(merged_df['HP_kWh_b3']).abs()
merged_df['percent_difference'] = (
    merged_df['variance']
    .div(merged_df[['HP_kWh_b1', 'HP_kWh_b3']].mean(axis=1))
    .mul(100)
)

# Keep the five rows with the largest percent difference.
top_percent_differences = merged_df.sort_values('percent_difference', ascending=False).iloc[:5]

# Show the result.
print(top_percent_differences.loc[:, ['timestamp', 'HP_kWh_b1', 'HP_kWh_b3', 'variance', 'percent_difference']])


               timestamp     HP_kWh_b1     HP_kWh_b3       variance  \
191  2018-01-08 23:30:00  1.776661e+06  1.676849e+06   99811.732868   
7751 2018-11-19 23:30:00  1.755723e+06  1.658226e+06   97497.740750   
1436 2018-03-01 20:30:00  2.618136e+06  2.473597e+06  144539.250931   
2140 2018-03-31 04:30:00  2.618136e+06  2.473597e+06  144539.250931   
1734 2018-03-14 06:30:00  2.618136e+06  2.473597e+06  144539.250931   

      percent_difference  
191             5.780306  
7751            5.711728  
1436            5.677408  
2140            5.677408  
1734            5.677408  


In [4]:
import pandas as pd

# Load and preprocess the data
def load_and_preprocess(file_path, scale_factor):
    """Read a CSV, scale its ``HP_kWh`` column and parse its ``timestamp`` column.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.
        scale_factor: Multiplier applied to every ``HP_kWh`` value.

    Returns:
        DataFrame with the file's columns, where ``HP_kWh`` has been
        multiplied by ``scale_factor`` and ``timestamp`` has been converted
        with ``pd.to_datetime``.
    """
    raw = pd.read_csv(file_path)
    return raw.assign(
        HP_kWh=raw['HP_kWh'] * scale_factor,
        timestamp=pd.to_datetime(raw['timestamp']),
    )

# Load the B1 file and the three bin files, scaling each on load. The three
# bin factors add up to the B1 factor (23016 + 1263 + 14 = 24293).
df1 = load_and_preprocess('FNSB_B1.csv', 24293)
dfbin1 = load_and_preprocess('FNSB_Bin1.csv', 23016)
dfbin2 = load_and_preprocess('FNSB_Bin2.csv', 1263)
dfbin3 = load_and_preprocess('FNSB_Bin3.csv', 14)

# Combined bin profile, keyed by the timestamps of bin 1.
# NOTE(review): the addition pairs rows by position (default row index), so
# it assumes the three bin files share the same timestamp ordering.
df2 = dfbin1[['timestamp']].copy()
df2['HP_kWh'] = dfbin1['HP_kWh'].add(dfbin2['HP_kWh']).add(dfbin3['HP_kWh'])

# Join B1 against the combined bins on timestamp.
merged_df = df1.merge(df2, on='timestamp', suffixes=('_b1', '_b3'))

# Totals over all joined rows.
total_HP_kWh_b1 = merged_df['HP_kWh_b1'].sum()
total_HP_kWh_b3 = merged_df['HP_kWh_b3'].sum()

# Absolute difference of the totals, and that difference relative to the
# mean of the two totals, in percent.
variance = abs(total_HP_kWh_b1 - total_HP_kWh_b3)
percent_difference = variance / ((total_HP_kWh_b1 + total_HP_kWh_b3) / 2) * 100

# Per-row absolute difference and percent difference. These columns are
# stored on merged_df but are not printed in this cell.
merged_df['row_variance'] = merged_df['HP_kWh_b1'].sub(merged_df['HP_kWh_b3']).abs()
merged_df['row_percent_difference'] = (
    merged_df['row_variance']
    .div(merged_df[['HP_kWh_b1', 'HP_kWh_b3']].mean(axis=1))
    .mul(100)
)

# Report the four total-level figures, one per line.
print(
    f"Total HP_kWh_b1: {total_HP_kWh_b1}",
    f"Total HP_kWh_b3: {total_HP_kWh_b3}",
    f"Variance: {variance}",
    f"Percent Difference: {percent_difference:.2f}%",
    sep="\n",
)




Total HP_kWh_b1: 6479794627.753269
Total HP_kWh_b3: 6398057239.171097
Variance: 81737388.5821724
Percent Difference: 1.27%
