In [11]:
import pandas as pd

# Function to load, merge, and calculate variances and percent differences
def analyze_differences(df_base, df_compare, base_label, compare_label):
    # Rename the columns to standardize them for merging and calculation
    df_base.rename(columns={'HP_kWh': 'HP_kWh_base'}, inplace=True)
    df_compare.rename(columns={'HP_kWh': 'HP_kWh_compare'}, inplace=True)
    
    # Merge on timestamp
    merged_df = pd.merge(df_base, df_compare, on='timestamp')
    
    # Calculate variance and percent difference
    merged_df['variance'] = (merged_df['HP_kWh_base'] - merged_df['HP_kWh_compare']).abs()
    merged_df['percent_difference'] = (merged_df['variance'] / merged_df[['HP_kWh_base', 'HP_kWh_compare']].mean(axis=1)) * 100
    
    # Get top 5 percent differences
    top_percent_differences = merged_df.sort_values(by='percent_difference', ascending=False).head(5)
    print(f"Top 5 percent differences between {base_label} and {compare_label}:")
    print(top_percent_differences[['timestamp', 'HP_kWh_base', 'HP_kWh_compare', 'variance', 'percent_difference']])
    print()  # Print a newline for better separation

# Function to load and preprocess the data
def load_and_preprocess(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is in datetime format
    return df

# Load files and prepare the data frames
df1 = load_and_preprocess('FNSB_B1.csv', 24293)
df2 = load_and_preprocess('KPB_B1.csv', 9730)
df3 = load_and_preprocess('Mat_Su_B1.csv', 31715)
df4 = load_and_preprocess('ANC_B1.csv', 14654)
df5 = load_and_preprocess('Denali_B1.csv', 2403)
df_salcha = load_and_preprocess('Salcha_C1.csv', 476)
df_fbx = load_and_preprocess('Fairbanks_C1.csv', 16703)
df_np = load_and_preprocess('North Pole_C1.csv', 7114)
df_ken = load_and_preprocess('Kenai_C1.csv', 4077)
df_nik = load_and_preprocess('Nikiski_C1.csv', 554)
df_sol = load_and_preprocess('Soldotna_C1.csv', 5099)
df_pal = load_and_preprocess('Palmer_C1.csv', 9778)
df_hous = load_and_preprocess('Houston_C1.csv', 597)
df_was = load_and_preprocess('Wasilla_C1.csv', 21340)
df_anc = load_and_preprocess('ANC_B1.csv', 14654)
df_den = load_and_preprocess('Denali_B1.csv', 2403)

# Creating a combined DataFrame for B1 locations
df_base = pd.DataFrame()
df_base['timestamp'] = df1['timestamp']
df_base['HP_kWh'] = (df1['HP_kWh'] + df2['HP_kWh'] + df3['HP_kWh'] + 
                     df4['HP_kWh'] + df5['HP_kWh'])

# Creating a combined DataFrame for C1 locations
df_C1 = pd.DataFrame()
df_C1['timestamp'] = df_salcha['timestamp']  # Assuming timestamp is aligned and exists in all datasets
df_C1['HP_kWh'] = (df_salcha['HP_kWh'] + df_fbx['HP_kWh'] + df_np['HP_kWh'] + 
                   df_ken['HP_kWh'] + df_nik['HP_kWh'] + df_sol['HP_kWh'] + 
                   df_pal['HP_kWh'] + df_hous['HP_kWh'] + df_was['HP_kWh'])

# Perform comparison
analyze_differences(df_base, df_C1, 'Combined_B1', 'Combined_C1')

Top 5 percent differences between Combined_B1 and Combined_C1:
               timestamp    HP_kWh_base  HP_kWh_compare       variance  \
8099 2018-12-04 11:30:00  830881.800000             0.0  830881.800000   
8105 2018-12-04 17:30:00  830881.800000             0.0  830881.800000   
5441 2018-08-15 17:30:00      94.759107             0.0      94.759107   
5439 2018-08-15 15:30:00     201.503238             0.0     201.503238   
8100 2018-12-04 12:30:00  830881.800000             0.0  830881.800000   

      percent_difference  
8099               200.0  
8105               200.0  
5441               200.0  
5439               200.0  
8100               200.0  



In [7]:
import pandas as pd

# Function to load, merge, and calculate variances and percent differences
def analyze_differences(df_base, df_compare, base_label, compare_label):
    # Calculate the sum of HP_kWh for each DataFrame
    sum_base = df_base['HP_kWh'].sum()
    sum_compare = df_compare['HP_kWh'].sum()
    
    # Calculate percent difference
    variance = abs(sum_base - sum_compare)
    percent_difference = (variance / ((sum_base + sum_compare) / 2)) * 100
    
    print(f"Total HP_kWh for {base_label}: {sum_base}")
    print(f"Total HP_kWh for {compare_label}: {sum_compare}")
    print(f"Variance between {base_label} and {compare_label}: {variance}")
    print(f"Percent difference between {base_label} and {compare_label}: {percent_difference:.2f}%")
    print()
    
    # Calculate row-wise variance and percent differences
    df_base.rename(columns={'HP_kWh': 'HP_kWh_base'}, inplace=True)
    df_compare.rename(columns={'HP_kWh': 'HP_kWh_compare'}, inplace=True)
    merged_df = pd.merge(df_base, df_compare, on='timestamp')
    merged_df['row_variance'] = (merged_df['HP_kWh_base'] - merged_df['HP_kWh_compare']).abs()
    merged_df['row_percent_difference'] = (merged_df['row_variance'] / merged_df[['HP_kWh_base', 'HP_kWh_compare']].mean(axis=1)) * 100
    
    

# Function to load and preprocess the data
def load_and_preprocess(file_path, scale_factor):
    df = pd.read_csv(file_path)
    df['HP_kWh'] *= scale_factor
    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is in datetime format
    return df

# Load files and prepare the data frames
df1 = load_and_preprocess('FNSB_B1.csv', 24293)
df2 = load_and_preprocess('KPB B1.csv', 9730)
df3 = load_and_preprocess('Mat_Su_B1.csv', 31715)
df4 = load_and_preprocess('ANC_B1.csv', 14654)
df5 = load_and_preprocess('Denali_B1.csv', 2403)
df_salcha = load_and_preprocess('Salcha_C1.csv', 476)
df_fbx = load_and_preprocess('Fairbanks_C1.csv', 16703)
df_np = load_and_preprocess('North Pole_C1.csv', 7114)
df_ken = load_and_preprocess('Kenai_C1.csv', 4077)
df_nik = load_and_preprocess('Nikiski_C1.csv', 554)
df_sol = load_and_preprocess('Soldotna_C1.csv', 5099)
df_pal = load_and_preprocess('Palmer_C1.csv', 9778)
df_hous = load_and_preprocess('Houston_C1.csv', 597)
df_was = load_and_preprocess('Wasilla_C1.csv', 21340)
df_anc = load_and_preprocess('ANC_B1.csv', 14654)
df_den = load_and_preprocess('Denali_B1.csv', 2403)

# Creating a combined DataFrame for B1 locations
df_base = pd.DataFrame()
df_base['timestamp'] = df1['timestamp']
df_base['HP_kWh'] = (df1['HP_kWh'] + df2['HP_kWh'] + df3['HP_kWh'] + 
                     df4['HP_kWh'] + df5['HP_kWh'])

# Creating a combined DataFrame for C1 locations
df_C1 = pd.DataFrame()
df_C1['timestamp'] = df_salcha['timestamp']  # Assuming timestamp is aligned and exists in all datasets
df_C1['HP_kWh'] = (df_salcha['HP_kWh'] + df_fbx['HP_kWh'] + df_np['HP_kWh'] + 
                   df_ken['HP_kWh'] + df_nik['HP_kWh'] + df_sol['HP_kWh'] + 
                   df_pal['HP_kWh'] + df_hous['HP_kWh'] + df_was['HP_kWh'] + df_anc['HP_kWh'] + df_den['HP_kWh'])

# Perform comparison
analyze_differences(df_base, df_C1, 'Combined_B1', 'Combined_C1')


Total HP_kWh for Combined_B1: 33702264189.74153
Total HP_kWh for Combined_C1: 14344500283.49129
Variance between Combined_B1 and Combined_C1: 19357763906.250244
Percent difference between Combined_B1 and Combined_C1: 80.58%

