In [2]:
import pandas as pd

# Function to merge two data frames on timestamp and print the largest row-wise percent differences
def analyze_differences(df_base, df_compare, base_label, compare_label):
    """Print the five timestamps with the largest percent difference in HP_kWh.

    The two frames are joined on their 'timestamp' column (pandas' default
    inner join), so only timestamps present in both are compared.

    Args:
        df_base: DataFrame with 'timestamp' and 'HP_kWh' columns.
        df_compare: DataFrame with 'timestamp' and 'HP_kWh' columns.
        base_label: Name used for ``df_base`` in the printed header.
        compare_label: Name used for ``df_compare`` in the printed header.

    Returns:
        None. The result is written to stdout only.

    Notes:
        * 'variance' here is the absolute difference between the two values,
          not a statistical variance.
        * 'percent_difference' is symmetric: the absolute difference divided
          by the mean of the two values, times 100. Rows where both values
          are zero yield NaN and are sorted last.
        * Neither input frame is modified.
    """
    # Rename on copies. Renaming in place would strip the 'HP_kWh' column from
    # the caller's frames, breaking any later use of them (including passing
    # the same frame in the other role).
    base = df_base.rename(columns={'HP_kWh': 'HP_kWh_base'})
    compare = df_compare.rename(columns={'HP_kWh': 'HP_kWh_compare'})

    # Merge on timestamp
    merged_df = pd.merge(base, compare, on='timestamp')

    # Calculate variance and percent difference
    merged_df['variance'] = (merged_df['HP_kWh_base'] - merged_df['HP_kWh_compare']).abs()
    merged_df['percent_difference'] = (merged_df['variance'] / merged_df[['HP_kWh_base', 'HP_kWh_compare']].mean(axis=1)) * 100

    # Get top 5 percent differences
    top_percent_differences = merged_df.sort_values(by='percent_difference', ascending=False).head(5)
    print(f"Top 5 percent differences between {base_label} and {compare_label}:")
    print(top_percent_differences[['timestamp', 'HP_kWh_base', 'HP_kWh_compare', 'variance', 'percent_difference']])
    print()  # Print a newline for better separation

# Load and preprocess data
def load_and_preprocess(file_path, scale_factor):
    """Read a CSV and return it with scaled 'HP_kWh' and parsed 'timestamp'.

    Args:
        file_path: Path of the CSV file; it must contain 'HP_kWh' and
            'timestamp' columns.
        scale_factor: Multiplier applied to every 'HP_kWh' value.

    Returns:
        A DataFrame with the file's columns in their original order, where
        'HP_kWh' has been multiplied by ``scale_factor`` and 'timestamp' has
        been converted with ``pd.to_datetime``.
    """
    raw = pd.read_csv(file_path)
    # assign() overwrites existing columns in place, so column order is kept.
    return raw.assign(
        HP_kWh=raw['HP_kWh'] * scale_factor,
        timestamp=pd.to_datetime(raw['timestamp']),
    )

# Load files and prepare the data frames
df_base = load_and_preprocess('FNSB_B1.csv', 24293)
df_salcha = load_and_preprocess('Salcha_C1.csv', 476)
df_fbx = load_and_preprocess('Fairbanks_C1.csv', 16703)
df_np = load_and_preprocess('North Pole_C1.csv', 7114)

# The C1 profiles are added row by row below, which is only meaningful when
# all three frames list the same timestamps in the same order. Verify that
# rather than assuming it, so a misaligned file fails loudly instead of
# silently skewing the combined totals.
if not all(frame['timestamp'].equals(df_salcha['timestamp']) for frame in (df_fbx, df_np)):
    raise ValueError("C1 timestamps are not aligned across Salcha, Fairbanks and North Pole")

# Creating a combined DataFrame for C1 locations
df_C1 = pd.DataFrame({
    'timestamp': df_salcha['timestamp'],
    'HP_kWh': df_salcha['HP_kWh'] + df_fbx['HP_kWh'] + df_np['HP_kWh'],
})

# Perform comparison
analyze_differences(df_base, df_C1, 'FNSB_B1', 'Combined_C1')


Top 5 percent differences between FNSB_B1 and Combined_C1:
               timestamp   HP_kWh_base  HP_kWh_compare      variance  \
1438 2018-03-01 22:30:00  2.754826e+06   427412.270882  2.327414e+06   
1439 2018-03-01 23:30:00  2.754826e+06   428840.000279  2.325986e+06   
1437 2018-03-01 21:30:00  2.718801e+06   426108.280674  2.292693e+06   
1709 2018-03-13 05:30:00  1.800526e+06   283717.981386  1.516808e+06   
1711 2018-03-13 07:30:00  1.800526e+06   283717.981386  1.516808e+06   

      percent_difference  
1438          146.275268  
1439          146.119980  
1437          145.803429  
1709          145.549945  
1711          145.549945  



In [4]:
import pandas as pd

# Function to load and preprocess the data
def load_and_preprocess(file_path, scale_factor):
    """Load a CSV, scale its 'HP_kWh' column and parse its 'timestamp' column.

    Args:
        file_path: Path of the CSV file; it must contain 'HP_kWh' and
            'timestamp' columns.
        scale_factor: Multiplier applied to every 'HP_kWh' value.

    Returns:
        The loaded DataFrame with 'HP_kWh' multiplied by ``scale_factor`` and
        'timestamp' converted with ``pd.to_datetime``.
    """
    frame = pd.read_csv(file_path)
    scaled_kwh = frame['HP_kWh'] * scale_factor
    parsed_timestamps = pd.to_datetime(frame['timestamp'])  # datetime dtype
    frame['HP_kWh'] = scaled_kwh
    frame['timestamp'] = parsed_timestamps
    return frame

# Function to merge two data frames on timestamp and report the total variance and percent difference
def analyze_differences(df_base, df_compare, base_label, compare_label):
    """Print total HP_kWh for two frames and how far apart the totals are.

    The frames are joined on 'timestamp' (pandas' default inner join), so the
    totals cover only timestamps present in both frames.

    Args:
        df_base: DataFrame with 'timestamp' and 'HP_kWh' columns.
        df_compare: DataFrame with 'timestamp' and 'HP_kWh' columns.
        base_label: Name used for ``df_base`` in the printed report.
        compare_label: Name used for ``df_compare`` in the printed report.

    Returns:
        None. The report is written to stdout only.

    Notes:
        * 'Variance' in the report is the absolute difference of the two
          totals, not a statistical variance.
        * 'Percent difference' is symmetric: the absolute difference divided
          by the mean of the two totals, times 100.
    """
    # Merge on timestamp; the shared 'HP_kWh' column is disambiguated by suffix.
    merged_df = pd.merge(df_base, df_compare, on='timestamp', suffixes=('_base', '_compare'))

    # Calculate the sum of HP_kWh for each DataFrame
    sum_base = merged_df['HP_kWh_base'].sum()
    sum_compare = merged_df['HP_kWh_compare'].sum()

    # Calculate percent difference
    variance = abs(sum_base - sum_compare)
    percent_difference = (variance / ((sum_base + sum_compare) / 2)) * 100

    # The earlier row-wise variance / top-5 computation was never reported or
    # returned, so it has been dropped; the printed output is unchanged.
    print(f"Total HP_kWh for {base_label}: {sum_base}")
    print(f"Total HP_kWh for {compare_label}: {sum_compare}")
    print(f"Variance between {base_label} and {compare_label}: {variance}")
    print(f"Percent difference between {base_label} and {compare_label}: {percent_difference:.2f}%")
    print()
    

# Load files and prepare the data frames
df_base = load_and_preprocess('FNSB_B1.csv', 24293)
df_salcha = load_and_preprocess('Salcha_C1.csv', 476)
df_fbx = load_and_preprocess('Fairbanks_C1.csv', 16703)
df_np = load_and_preprocess('North Pole_C1.csv', 7114)

# The C1 profiles are added row by row below, which is only meaningful when
# all three frames list the same timestamps in the same order. Verify that
# rather than assuming it, so a misaligned file fails loudly instead of
# silently skewing the combined totals.
if not all(frame['timestamp'].equals(df_salcha['timestamp']) for frame in (df_fbx, df_np)):
    raise ValueError("C1 timestamps are not aligned across Salcha, Fairbanks and North Pole")

# Creating a combined DataFrame for C1 locations
df_C1 = pd.DataFrame({
    'timestamp': df_salcha['timestamp'],  # Align timestamp
    'HP_kWh': df_salcha['HP_kWh'] + df_fbx['HP_kWh'] + df_np['HP_kWh'],
})

# Perform comparison
analyze_differences(df_base, df_C1, 'FNSB_B1', 'Combined_C1')


Total HP_kWh for FNSB_B1: 6479794627.753269
Total HP_kWh for Combined_C1: 1972762933.4505243
Variance between FNSB_B1 and Combined_C1: 4507031694.302745
Percent difference between FNSB_B1 and Combined_C1: 106.64%



In [1]:
import pandas as pd

# Function to compare the total HP_kWh of two data frames (absolute and percent difference)
def analyze_differences(df_base, df_compare, base_label, compare_label):
    """Print the total HP_kWh of two frames and the gap between the totals.

    Args:
        df_base: DataFrame with an 'HP_kWh' column.
        df_compare: DataFrame with an 'HP_kWh' column.
        base_label: Name used for ``df_base`` in the printed report.
        compare_label: Name used for ``df_compare`` in the printed report.

    Returns:
        None. The report is written to stdout only.

    Notes:
        'Variance' in the report is the absolute difference of the two
        totals; 'Percent difference' is that difference relative to the mean
        of the two totals, times 100.
    """
    total_base, total_compare = (
        frame['HP_kWh'].sum() for frame in (df_base, df_compare)
    )

    # Symmetric percent difference: gap relative to the midpoint of the totals.
    gap = abs(total_base - total_compare)
    midpoint = (total_base + total_compare) / 2
    gap_pct = (gap / midpoint) * 100

    report_lines = (
        f"Total HP_kWh for {base_label}: {total_base}",
        f"Total HP_kWh for {compare_label}: {total_compare}",
        f"Variance between {base_label} and {compare_label}: {gap}",
        f"Percent difference between {base_label} and {compare_label}: {gap_pct:.2f}%",
        "",  # trailing blank line separates consecutive reports
    )
    for line in report_lines:
        print(line)

# Load files and prepare the data frames
df_base = pd.read_csv('FNSB_B1.csv')
df_salcha = pd.read_csv('Salcha_C1.csv')
df_fbx = pd.read_csv('Fairbanks_C1.csv')
df_np = pd.read_csv('North Pole_C1.csv')

# Print the sizes of the data frames
print(f"Size of df_base: {df_base.shape}")
print(f"Size of df_salcha: {df_salcha.shape}")
print(f"Size of df_fbx: {df_fbx.shape}")
print(f"Size of df_np: {df_np.shape}")

# Scale kWh by the number of units
df_base['HP_kWh'] *= 24293
df_salcha['HP_kWh'] *= 476
df_fbx['HP_kWh'] *= 16703
df_np['HP_kWh'] *= 7114

# Print the intermediate sums
print(f"Sum of HP_kWh in df_base: {df_base['HP_kWh'].sum()}")
print(f"Sum of HP_kWh in df_salcha: {df_salcha['HP_kWh'].sum()}")
print(f"Sum of HP_kWh in df_fbx: {df_fbx['HP_kWh'].sum()}")
print(f"Sum of HP_kWh in df_np: {df_np['HP_kWh'].sum()}")

# The C1 profiles are added row by row below, which is only meaningful when
# all three frames list the same timestamps in the same order. Verify that
# rather than assuming it, so a misaligned file fails loudly instead of
# silently skewing the combined totals. Matching shapes alone do not prove it.
if not all(frame['timestamp'].equals(df_salcha['timestamp']) for frame in (df_fbx, df_np)):
    raise ValueError("C1 timestamps are not aligned across Salcha, Fairbanks and North Pole")

# Creating a combined DataFrame for C1 locations
df_C1 = pd.DataFrame()
df_C1['timestamp'] = df_salcha['timestamp']  # Align timestamp
df_C1['HP_kWh'] = df_salcha['HP_kWh'] + df_fbx['HP_kWh'] + df_np['HP_kWh']

# Check for missing values
print(f"Missing values in df_base: {df_base.isnull().sum().sum()}")
print(f"Missing values in df_C1: {df_C1.isnull().sum().sum()}")

# Perform comparison
analyze_differences(df_base, df_C1, 'FNSB_B1', 'Combined_C1')


Size of df_base: (8760, 4)
Size of df_salcha: (8760, 4)
Size of df_fbx: (8760, 4)
Size of df_np: (8760, 4)
Sum of HP_kWh in df_base: 6479794627.753269
Sum of HP_kWh in df_salcha: 1652483.2909952358
Sum of HP_kWh in df_fbx: 43742232.46418024
Sum of HP_kWh in df_np: 27322155.5200162
Missing values in df_base: 0
Missing values in df_C1: 0
Total HP_kWh for FNSB_B1: 6479794627.753269
Total HP_kWh for Combined_C1: 72716871.27519168
Variance between FNSB_B1 and Combined_C1: 6407077756.478078
Percent difference between FNSB_B1 and Combined_C1: 195.56%

