In [1]:
from sklearn.feature_selection import mutual_info_regression 
import pandas as pd 
import numpy as np 

In [6]:
df = pd.read_csv('ev_charging_sessions.csv')
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['hour_start'] = df['start_time'].dt.hour
df['hour_end'] = df['end_time'].dt.hour
df['start_date'] = df['start_time'].dt.date
df['end_date'] = df['end_time'].dt.date

X = df.copy() 
X.dropna(subset='energy_kWh', axis=0, inplace=True)
y = X.energy_kWh 
X.drop('energy_kWh', axis=1, inplace=True)
X.drop('start_time', axis=1, inplace=True)
X.drop('end_time', axis=1, inplace=True)

In [7]:
X.dtypes

session_id      object
user_id         object
vehicle_id      object
station_id      object
duration_min     int64
session_day     object
session_type    object
hour_start       int32
hour_end         int32
start_date      object
end_date        object
dtype: object

In [11]:
print(X.shape[0], y.shape[0])
X.head()

3500 3500


Unnamed: 0,session_id,user_id,vehicle_id,station_id,duration_min,session_day,session_type,hour_start,hour_end,start_date,end_date
0,CS0001,U339,V347,S091,77,Weekday,Occasional,12,13,2024-11-11,2024-11-11
1,CS0002,U286,V463,S025,97,Weekend,Emergency,19,21,2024-11-10,2024-11-10
2,CS0003,U092,V419,S007,117,Weekend,Regular,18,20,2024-11-26,2024-11-26
3,CS0004,U369,V070,S008,109,Weekday,Emergency,19,21,2024-11-28,2024-11-28
4,CS0005,U185,V298,S037,79,Weekend,Occasional,13,14,2024-11-27,2024-11-27


In [28]:
def factorize_all(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col], _ = pd.factorize(df[col])
    return df

X = factorize_all(X)
y, _ = pd.factorize(y)

print(X.dtypes, y.dtype)

session_id      int64
user_id         int64
vehicle_id      int64
station_id      int64
duration_min    int64
session_day     int64
session_type    int64
hour_start      int32
hour_end        int32
start_date      int64
end_date        int64
dtype: object int64


In [31]:
def get_mi_scores(X, y):
    X = X.copy()

    # Treat only known categorical fields as discrete
    discrete_cols = ['session_type', 'session_day']

    # Factorize these if necessary (not strictly needed since they are already ints)
    for col in discrete_cols:
        if col in X.columns:
            X[col], _ = pd.factorize(X[col])

    # Discrete mask (True only for categorical features)
    discrete_features = [col in discrete_cols for col in X.columns]

    # Compute MI
    scores = mutual_info_regression(
        X,
        y,
        discrete_features=discrete_features,
        random_state=0
    )

    scores = pd.Series(scores, name='MI Scores', index=X.columns)
    return scores.sort_values(ascending=False)


get_mi_scores(X, y)

session_id      4.686635
vehicle_id      0.289215
user_id         0.266036
hour_start      0.009972
station_id      0.008921
duration_min    0.000000
session_day     0.000000
session_type    0.000000
hour_end        0.000000
start_date      0.000000
end_date        0.000000
Name: MI Scores, dtype: float64

In [32]:
get_mi_scores(X[['duration_min', 'hour_start', 'hour_end', 'session_day', 'session_type']], y)

hour_start      0.011292
duration_min    0.000000
hour_end        0.000000
session_day     0.000000
session_type    0.000000
Name: MI Scores, dtype: float64