In [3]:
import pickle
import numpy as np
import math, time
import pandas as pd
import joblib
import dill

In [None]:
# Forward slashes keep the relative path valid on every OS; the previous
# backslash form only resolved on Windows.
df = joblib.load("../data/preprocessed_encoded_data.pkl")

# Sensitive column groups for the long-session classifier (keys are plain ids).
sensitive_features = {
    0: ["User Type"],
    # 1: ["Charger Type"],
    2: ["Vehicle Age (years)"],
    3: ["Battery Capacity (kWh)"],
    4: ["Vehicle Model_BMW i3", "Vehicle Model_Hyundai Kona", "Vehicle Model_Nissan Leaf", "Vehicle Model_Tesla Model 3"],
    5: ["Charging Station Location_Chicago", "Charging Station Location_Houston", "Charging Station Location_Los Angeles", "Charging Station Location_New York", "Charging Station Location_San Francisco"],
}

# Columns used by the cost regression model. The vehicle-age column is named
# "Vehicle Age (years)" in the data, so the name is spelled out in full here.
cost_regression_features = ["Battery Capacity (kWh)", "Charging Rate (kW)", "Energy Consumed (kWh)", "Vehicle Age (years)", "Time of Day"]

In [5]:
# Summarise the loaded frame (columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9320 entries, 0 to 9319
Data columns (total 26 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Battery Capacity (kWh)                    9320 non-null   float64
 1   Energy Consumed (kWh)                     9320 non-null   float64
 2   Charging Rate (kW)                        9320 non-null   float64
 3   Charging Time Difference (minutes)        9320 non-null   int64  
 4   Time of Day                               9320 non-null   int64  
 5   Day of Week                               9320 non-null   int64  
 6   State of Charge (Start %)                 9320 non-null   float64
 7   State of Charge (End %)                   9320 non-null   float64
 8   Distance Driven (since last charge) (km)  9320 non-null   float64
 9   Temperature (Â°C)                          9320 non-null   float64
 10  Vehicle Age (years)                

In [6]:
splits = joblib.load("../data/preprocessed_splits.pkl")

# Every task entry is read through the same four partition keys.
_SPLIT_KEYS = ("X_train", "y_train", "X_test", "y_test")

# Long-session classification partitions
(
    X_train_long_session,
    y_train_long_session,
    X_test_long_session,
    y_test_long_session,
) = (splits["Long Session"][key] for key in _SPLIT_KEYS)

# Charging-cost regression partitions
(
    X_train_cost_regression,
    y_train_cost_regression,
    X_test_cost_regression,
    y_test_cost_regression,
) = (splits["Charging Cost (USD)"][key] for key in _SPLIT_KEYS)


In [30]:
# Persisted estimators. Forward slashes are used so the relative paths resolve
# on every OS (the backslash form only works on Windows) and match the other
# loads in this notebook.
long_session = joblib.load("../saved_models/long_session.pkl")
cost_regression = joblib.load("../saved_models/cost_regression.pkl")

## Long Session Classification

To find bias, calculate the disparate impact ratio (DIR):
DIR = **p**(protected group) / **p**(privileged group)

Normal is 1, and long session is 0

In [None]:
def calculate_probabilities(df):
    """Count normal and long sessions in ``df`` and convert them to rates.

    The "Long Session" column is read with 1 meaning a normal session and 0
    meaning a long session. Counts are looked up by label rather than by
    frequency rank, so the result stays correct when long sessions are the
    majority or when one of the classes is absent from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Long Session" column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "Normal" and "Long" (counts) and "Pnormal"
        and "Plong" (shares of the total). The shares are NaN when ``df``
        holds no sessions.
    """
    counts = df["Long Session"].value_counts()
    normal = int(counts.get(1, 0))
    long = int(counts.get(0, 0))
    total = normal + long

    # An empty subset has no defined rate; report NaN instead of dividing by zero.
    Pnormal = normal / total if total else float("nan")
    Plong = long / total if total else float("nan")

    probability_df = pd.DataFrame({
        "Normal": [normal],
        "Pnormal": [Pnormal],
        "Long": [long],
        "Plong": [Plong]
    })

    return probability_df
def find_DP(protected_group, priviledged_group, tolerance=0.05):
    """Print whether the demographic parity difference is within tolerance.

    Parameters
    ----------
    protected_group, priviledged_group : pandas.Series
        Single-element series holding each group's rate; only the first
        element of their difference is used.
    tolerance : float, default 0.05
        Absolute differences below this value are reported as fair.

    Returns
    -------
    float
        The absolute difference between the two rates.
    """
    DP = float(abs(protected_group - priviledged_group).iloc[0])
    if DP < tolerance:
        print("Demographic Parity is Fair")
    else:
        print("Demographic Parity is Unfair")
    return DP
def find_DIR(protected_group, priviledged_group, lower=0.8, upper=1.25):
    """Print a verdict based on the disparate impact ratio of two rates.

    Parameters
    ----------
    protected_group, priviledged_group : pandas.Series
        Single-element series holding each group's rate; only the first
        element of their ratio is used.
    lower, upper : float, default 0.8 and 1.25
        Ratios below ``lower`` are reported as biased against the protected
        group, ratios above ``upper`` as biased in its favor.

    Returns
    -------
    float
        The ratio protected / privileged (NaN when it is undefined).
    """
    DIR = float((protected_group / priviledged_group).iloc[0])
    if math.isnan(DIR):
        # NaN compares False against both bounds and would otherwise be
        # reported as "No bias".
        print("Disparate impact ratio is undefined")
    elif DIR < lower:
        print("Data is biased against the protected group")
    elif DIR > upper:
        print("Data is biased in favor of the protected group")
    else:
        print("No bias")
    return DIR

In [None]:
# Maps each audited column to the way calculate_fairness splits it into two groups:
#   "continuous"  -> split around the column median
#   "categorical" -> most frequent value vs. every other value
#   "binary"      -> rows equal to 1 vs. rows equal to 0
sensitive_map = {
    "Vehicle Age (years)": "continuous",
    "Battery Capacity (kWh)": "continuous",
    "Distance Driven (since last charge) (km)": "continuous",
    "User Type": "categorical", 
    "Vehicle Model_BMW i3": "binary", # one-hot encoded features only output 0 and 1, so they are treated as binary
    "Charging Station Location_Chicago": "binary"
}




In [34]:
def calculate_fairness(df, features):
    """Print demographic-parity and disparate-impact verdicts per feature.

    For every feature the rows of ``df`` are split into a privileged and a
    protected group, the long-session rate ("Plong") of each group is
    computed, and both rates are passed to ``find_DP`` and ``find_DIR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the "Long Session" column.
    features : dict
        Maps a column name to its split rule:
        "continuous" -> privileged is >= median, protected is < median;
        "categorical" -> privileged is the most frequent value, protected is
        every other value; any other rule -> privileged is 1, protected is 0.
    """
    for name, kind in features.items():
        column = df[name]
        if kind == "continuous":
            threshold = column.median()
            privileged_subset = column >= threshold
            # Strictly below the median, so rows at the median are not
            # counted in both groups.
            protected_subset = column < threshold
        elif kind == 'categorical':
            top_cat = column.mode()[0]
            privileged_subset = column == top_cat
            protected_subset = column != top_cat
        else:  # one-hot encoded columns
            privileged_subset = column == 1
            protected_subset = column == 0

        df_priv = df[privileged_subset]
        df_prot = df[protected_subset]
        print("="*100)
        print(f"Analyzing {name}")
        if df_priv.empty or df_prot.empty:
            # A group without rows has no rate to compare against.
            print("Skipped: one of the groups is empty")
            continue
        prob_priv_df = calculate_probabilities(df_priv)[['Plong']]
        prob_prot_df = calculate_probabilities(df_prot)[['Plong']]

        find_DP(prob_prot_df['Plong'], prob_priv_df['Plong'])
        find_DIR(prob_prot_df['Plong'], prob_priv_df['Plong'])

In [35]:
# Audit the full preprocessed dataset for every column listed in sensitive_map.
calculate_fairness(df, sensitive_map)

Analyzing Vehicle Age (years)
Demographic Party is Fair
No bias
Analyzing Battery Capacity (kWh)
Demographic Party is Unfair
Data is biased against the protected group
Analyzing Distance Driven (since last charge) (km)
Demographic Party is Fair
No bias
Analyzing User Type
Demographic Party is Fair
No bias
Analyzing Vehicle Model_BMW i3
Demographic Party is Fair
No bias
Analyzing Charging Station Location_Chicago
Demographic Party is Fair
No bias


In [36]:
# Compare the overall long-session rate with the overall normal-session rate.
# Both rates come from the same unsplit frame, so this measures class balance
# rather than a gap between two groups of rows.
probability_df = calculate_probabilities(df)
find_DIR(probability_df["Plong"], probability_df["Pnormal"])
find_DP(probability_df["Plong"], probability_df["Pnormal"])

Data is biased against the protected group
Demographic Party is Unfair


### Equalized Odds

In [37]:
from sklearn.metrics import recall_score, roc_curve
from fair_metrics import get_selection_rate, disparate_impact_ratio, demographic_parity_difference, eo_difference, get_equalized_odds

In [60]:
# Decision threshold per value of the grouping feature: EOD marks a row as 1
# when its y_pred_proba exceeds the threshold of the group it belongs to.
thresholds = {
    0: 0.8,
    1: 0.5,
    2: 0.6
}
class EOD:
    """Apply group-specific thresholds and display equalized-odds gaps.

    On construction the instance thresholds ``y_pred_proba`` per group into a
    ``y_pred_adjusted`` column and then displays a one-row frame with the
    TPR/FPR of groups 1 and 0 and the absolute gaps between them.

    Parameters
    ----------
    thresholds : dict
        Maps a group value of ``feature`` to its probability threshold.
    df : pandas.DataFrame
        Must contain ``feature``, "y_true" and "y_pred_proba". The frame is
        copied, so the caller's object is left untouched.
    feature : str
        Column whose values identify the groups.

    Attributes
    ----------
    df : pandas.DataFrame
        Private copy of the input with the added "y_pred_adjusted" column.
    metrics : pandas.DataFrame
        The frame displayed by ``calculate_EODs``.
    """

    def __init__(self, thresholds, df, feature):
        self.thresholds = thresholds
        # Copy so that adding the adjusted-prediction column does not modify
        # the frame owned by the caller.
        self.df = df.copy()
        self.feature = feature
        self.metrics = None
        self.apply_thresholding()
        self.calculate_EODs()

    def apply_thresholding(self):
        """Fill ``y_pred_adjusted`` with 1 where a row beats its group threshold."""
        # Rows whose group has no entry in ``thresholds`` keep the default 0.
        self.df['y_pred_adjusted'] = 0
        for g, tau in self.thresholds.items():
            mask = (self.df[self.feature] == g) & (self.df['y_pred_proba'] > tau)
            self.df.loc[mask, 'y_pred_adjusted'] = 1

    def calculate_EODs(self):
        """Compute, store and display the TPR/FPR gaps between groups 1 and 0."""
        # NOTE(review): only groups 1 and 0 are compared; any other group
        # present in ``thresholds`` is not part of the reported gaps.
        tpr1, fpr1 = get_equalized_odds(
            self.df, self.feature, 1, "y_true", "y_pred_adjusted"
        )
        tpr0, fpr0 = get_equalized_odds(
            self.df, self.feature, 0, "y_true", "y_pred_adjusted"
        )

        # Equalized-odds violation: absolute difference of the two rates.
        eo_tpr_gap = abs(tpr0 - tpr1)
        eo_fpr_gap = abs(fpr0 - fpr1)
        metric_df = pd.DataFrame({
            "TPR gap": [eo_tpr_gap],
            "FPR gap": [eo_fpr_gap],
            "tpr1": [tpr1],
            "fpr1": [fpr1],
            "tpr0": [tpr0],
            "fpr0": [fpr0]
        })
        # Keep the result so it can be inspected after construction.
        self.metrics = metric_df
        display(metric_df)


### Mitigation Steps

#### Preprocessing Stage

In [39]:
#oversampling
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

In [40]:
# Separate the predictors from the "Long Session" target.
X = df.drop(columns="Long Session")
y = df["Long Session"]

In [41]:
# Resample the whole dataset with SMOTE, then re-run the fairness audit on it.
# NOTE(review): X and y are the full dataset here, despite the "train" variable
# names and the "Training" labels in the printed messages; a later cell assigns
# X_train_smote / y_train_smote again from the training split.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X, y)
print(f"Original Training Class Distribution: {Counter(y)}")
print(f"Smote Training Class Distribution: {Counter(y_train_smote)}")
# Re-attach the target so calculate_fairness can read "Long Session".
smote_df = pd.concat([X_train_smote, y_train_smote],axis=1)
calculate_fairness(smote_df, sensitive_map)

Original Training Class Distribution: Counter({1: 5874, 0: 3446})
Smote Training Class Distribution: Counter({1: 5874, 0: 5874})
Analyzing Vehicle Age (years)
Demographic Party is Fair
No bias
Analyzing Battery Capacity (kWh)
Demographic Party is Fair
No bias
Analyzing Distance Driven (since last charge) (km)
Demographic Party is Fair
No bias
Analyzing User Type
Demographic Party is Fair
No bias
Analyzing Vehicle Model_BMW i3
Demographic Party is Unfair
No bias
Analyzing Charging Station Location_Chicago
Demographic Party is Fair
No bias


In [45]:
# Resample only the long-session training split with SMOTE and audit the result.
# This overwrites the X_train_smote / y_train_smote produced from the full dataset.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_long_session, y_train_long_session)
print(f"Original Training Class Distribution: {Counter(y_train_long_session)}")
print(f"Smote Training Class Distribution: {Counter(y_train_smote)}")
# Re-attach the target so calculate_fairness can read "Long Session".
smote_df = pd.concat([X_train_smote, y_train_smote],axis=1)
calculate_fairness(smote_df, sensitive_map)

Original Training Class Distribution: Counter({1: 4687, 0: 2769})
Smote Training Class Distribution: Counter({0: 4687, 1: 4687})
Analyzing Vehicle Age (years)
Demographic Party is Fair
No bias
Analyzing Battery Capacity (kWh)
Demographic Party is Fair
No bias
Analyzing Distance Driven (since last charge) (km)
Demographic Party is Fair
No bias
Analyzing User Type
Demographic Party is Fair
No bias
Analyzing Vehicle Model_BMW i3
Demographic Party is Fair
No bias
Analyzing Charging Station Location_Chicago
Demographic Party is Fair
No bias


In [64]:
def train_model(x_train,x_test, y_train, y_test, features, test_thresholds=None):
    """Refit ``long_session`` without each column group and report EOD gaps.

    For every column group in ``features`` the module-level ``long_session``
    estimator is refitted on ``x_train`` without those columns, the test set
    is scored, and an ``EOD`` report grouped by "User Type" is produced.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Training and test predictors; neither is modified.
    y_train, y_test : pandas.Series
        Matching targets.
    features : dict
        Maps an id to the list of columns removed for that run. "User Type"
        must not be removed because the EOD report groups on it.
    test_thresholds : dict, optional
        Per-group thresholds passed to ``EOD``. Defaults to the module-level
        ``thresholds`` so existing calls behave as before.
    """
    if test_thresholds is None:
        test_thresholds = thresholds

    for cols in features.values():
        print("-" * 100)
        print(f"Removing {cols}")
        # The shared estimator is refitted in place on every iteration.
        long_session.fit(x_train.drop(columns=cols), y_train)
        print("Finished Training")

        # drop() returns a new frame, so the columns added below never reach x_test.
        X_test1 = x_test.drop(columns=cols)
        y_pred1 = long_session.predict(X_test1)
        y_pred_proba1 = long_session.predict_proba(X_test1)[:, 1]
        X_test1['y_true'] = y_test.values
        X_test1['y_pred'] = y_pred1
        X_test1['y_pred_proba'] = y_pred_proba1

        EOD(test_thresholds, X_test1, "User Type")
def train_model_with_sample_weight(x_train,x_test, y_train, y_test, features, test_thresholds, weight_map=None):
    """Refit ``long_session`` with per-user-type sample weights and report EOD gaps.

    Same procedure as ``train_model``, except that every training row is
    weighted according to its "User Type" value.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Training and test predictors; neither is modified. ``x_train`` must
        contain "User Type", which supplies the sample weights.
    y_train, y_test : pandas.Series
        Matching targets.
    features : dict
        Maps an id to the list of columns removed for that run.
    test_thresholds : dict
        Per-group thresholds passed to ``EOD``.
    weight_map : dict, optional
        Maps a "User Type" value to its sample weight. Defaults to
        ``{0: 1, 1: 3, 2: 2}``.

    Raises
    ------
    ValueError
        If ``x_train`` contains a "User Type" value missing from ``weight_map``.
    """
    if weight_map is None:
        weight_map = {
            0: 1,
            1: 3,
            2: 2
        }

    # The weights must come from the frame that is actually fitted: a global
    # training frame can differ in length and row order from ``x_train``.
    # They do not depend on the dropped columns, so they are built once.
    sample_weights = x_train["User Type"].map(weight_map)
    if sample_weights.isna().any():
        raise ValueError("weight_map does not cover every 'User Type' value in x_train")

    for cols in features.values():
        print("-" * 100)
        print(f"Removing {cols}")
        # The shared estimator is refitted in place on every iteration.
        long_session.fit(x_train.drop(columns=cols), y_train, sample_weight=sample_weights)
        print("Finished Training")

        # drop() returns a new frame, so the columns added below never reach x_test.
        X_test1 = x_test.drop(columns=cols)
        y_pred1 = long_session.predict(X_test1)
        y_pred_proba1 = long_session.predict_proba(X_test1)[:, 1]
        X_test1['y_true'] = y_test.values
        X_test1['y_pred'] = y_pred1
        X_test1['y_pred_proba'] = y_pred_proba1

        EOD(test_thresholds, X_test1, "User Type")

In [65]:
# Column groups removed one at a time by train_model / train_model_with_sample_weight.
# This reassigns the sensitive_features name defined near the top of the notebook.
# "User Type" stays disabled: the EOD report groups on that column, so it has to
# remain in the test frame.
sensitive_features = {#0:["User Type"], 
                      1:["Charger Type"], 
                      2:["Vehicle Age (years)"], 
                      3:["Battery Capacity (kWh)"],
                      4:["Vehicle Model_BMW i3"],
                      5:["Charging Station Location_Chicago"]}

In [66]:
# Run 1: fit on the original training split, dropping one column group per iteration.
train_model(X_train_long_session, X_test_long_session, y_train_long_session, y_test_long_session, sensitive_features)
print('='*200)
# Run 2: same procedure, fitted on the SMOTE-resampled training data.
train_model(X_train_smote, X_test_long_session, y_train_smote, y_test_long_session, sensitive_features)


----------------------------------------------------------------------------------------------------
Removing ['Charger Type']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.412045,0.512073,0.775681,0.628352,0.363636,0.116279


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Age (years)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.394048,0.529515,0.796646,0.628352,0.402597,0.098837


----------------------------------------------------------------------------------------------------
Removing ['Battery Capacity (kWh)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.589316,0.597679,0.849057,0.766284,0.25974,0.168605


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Model_BMW i3']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.375589,0.597033,0.765199,0.655172,0.38961,0.05814


----------------------------------------------------------------------------------------------------
Removing ['Charging Station Location_Chicago']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.396553,0.515771,0.786164,0.643678,0.38961,0.127907


----------------------------------------------------------------------------------------------------
Removing ['Charger Type']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.425236,0.591219,0.769392,0.655172,0.344156,0.063953


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Age (years)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.433622,0.552504,0.777778,0.651341,0.344156,0.098837


----------------------------------------------------------------------------------------------------
Removing ['Battery Capacity (kWh)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.682193,0.625033,0.821803,0.747126,0.13961,0.122093


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Model_BMW i3']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.397778,0.57576,0.748428,0.651341,0.350649,0.075581


----------------------------------------------------------------------------------------------------
Removing ['Charging Station Location_Chicago']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.446813,0.498998,0.771488,0.586207,0.324675,0.087209


#### In Processing


In [61]:
# Every User Type value shares the same 0.5 cut-off in this run.
test_thresholds = {
    0: 0.5,
    1: 0.5,
    2: 0.5
}
# Refit with per-user-type sample weights and report the equalized-odds gaps.
train_model_with_sample_weight(X_train_long_session, X_test_long_session, y_train_long_session, y_test_long_session, sensitive_features, test_thresholds)

----------------------------------------------------------------------------------------------------
Removing ['Charger Type']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.078446,0.0603,0.750524,0.64751,0.672078,0.587209


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Age (years)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.104141,0.100998,0.792453,0.64751,0.688312,0.546512


----------------------------------------------------------------------------------------------------
Removing ['Battery Capacity (kWh)']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.119687,0.177092,0.84696,0.770115,0.727273,0.593023


----------------------------------------------------------------------------------------------------
Removing ['Vehicle Model_BMW i3']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.080876,0.180277,0.775681,0.662835,0.694805,0.482558


----------------------------------------------------------------------------------------------------
Removing ['Charging Station Location_Chicago']
Finished Training


Unnamed: 0,TPR gap,FPR gap,tpr1,fpr1,tpr0,fpr0
0,0.066943,0.206963,0.771488,0.701149,0.704545,0.494186


## Cost Regression

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
def find_MPD(df, col):
    """Return pairwise absolute gaps between groups' mean predictions.

    Rows of ``df`` are grouped by the values of ``col`` (most frequent value
    first), the module-level ``cost_regression`` model predicts each group,
    and the per-group means are printed. The result has one row per pair of
    groups holding the absolute difference of their means.
    """
    group_values = list(df[col].value_counts().index)

    mean_by_group = {}
    for value in group_values:
        predictions = cost_regression.predict(df[df[col] == value])
        mean_by_group[value] = float(predictions.mean())
    print(f"Means: {mean_by_group}")

    # Visit every unordered pair once, keeping the value_counts ordering.
    pairwise_gaps = {}
    for position, first in enumerate(group_values):
        for second in group_values[position + 1:]:
            pairwise_gaps[(first, second)] = abs(mean_by_group[first] - mean_by_group[second])
    return pd.DataFrame.from_dict(pairwise_gaps, orient="index")
def find_REP(df, y_true, col):
    """Return the mean absolute error of ``cost_regression`` per group of ``col``.

    Groups are visited in value_counts order. For each one the model predicts
    the group's rows and the error is measured against the entries of
    ``y_true`` that share those rows' index labels. The per-group errors are
    printed and returned as a one-column frame indexed by group value.
    """
    error_by_group = {}
    for value in list(df[col].value_counts().index):
        group_rows = df[df[col] == value]
        predictions = cost_regression.predict(group_rows)
        # y_true is aligned through the index labels of the selected rows.
        actual = y_true.loc[group_rows.index]
        error_by_group[value] = mean_absolute_error(actual, predictions)
    print(error_by_group)
    return pd.DataFrame.from_dict(error_by_group, orient="Index")
def find_RPP(df, y_true, col):
    """Return the R^2 score of ``cost_regression`` per group of ``col``.

    Groups are visited in value_counts order. For each one the model predicts
    the group's rows and the score is computed against the entries of
    ``y_true`` that share those rows' index labels. The per-group scores are
    printed and returned as a one-column frame indexed by group value.
    """
    score_by_group = {}
    for value in list(df[col].value_counts().index):
        group_rows = df[df[col] == value]
        predictions = cost_regression.predict(group_rows)
        # y_true is aligned through the index labels of the selected rows.
        actual = y_true.loc[group_rows.index]
        score_by_group[value] = r2_score(actual, predictions)
    print(score_by_group)
    return pd.DataFrame.from_dict(score_by_group, orient="Index")
def find_AVG(df, y_true, col):
    """Return the mean ``cost_regression`` prediction per group of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictors passed to the model; also holds the grouping column.
    y_true : any
        Unused; kept so the signature matches the sibling metric helpers.
    col : str
        Column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group value (value_counts order) with the mean prediction.
    """
    AVGs = {}
    for g in list(df[col].value_counts().index):
        y_pred_g = cost_regression.predict(df[df[col] == g])
        AVGs[g] = float(y_pred_g.mean())
    # Hand the computed means back to the caller; previously they were discarded.
    return pd.DataFrame.from_dict(AVGs, orient="index")

In [None]:
# Mean observed charging cost in the training split for each User Type value.
# (A standalone computation for User Type == 1 used to precede the loop; its
# result was never shown or stored, and the loop already covers that group.)
for i in range(3):
    indices = X_train_cost_regression[X_train_cost_regression["User Type"] == i].index
    print(y_train_cost_regression.loc[indices].mean())

11.040878794174665
10.79329660092902
10.820685503054131


In [112]:
# Group-wise metrics of the cost regression model on the test split, grouped by
# "User Type": pairwise gaps between mean predictions, per-group mean absolute
# error and per-group R^2.
MPDS = find_MPD(X_test_cost_regression, "User Type")
REPS = find_REP(X_test_cost_regression, y_test_cost_regression, "User Type")
RPPS = find_RPP(X_test_cost_regression, y_test_cost_regression, "User Type")
display(MPDS, REPS, RPPS)

Means: {1: 10.833821296691895, 2: 10.167403221130371, 0: 11.720026016235352}
{1: 0.08997791966022728, 2: 0.056078490616660134, 0: 0.05597419725286951}
{1: 0.9925962330902491, 2: 0.9990500376882796, 0: 0.9989247006072586}


Unnamed: 0,0
"(1, 2)",0.666418
"(1, 0)",0.886205
"(2, 0)",1.552623


Unnamed: 0,0
1,0.089978
2,0.056078
0,0.055974


Unnamed: 0,0
1,0.992596
2,0.99905
0,0.998925


`User Type` encoding: commuter = 1, long-distance traveler = 2, and casual driver = 0.

### Modelling