<a href="https://colab.research.google.com/github/aashi-chatterjee/EV-battery-health-prediction/blob/main/battery_health_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import scipy.io
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt
import seaborn as sns
import os
sns.set_style('whitegrid')

In [41]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
# Directory (on the mounted Drive) that is passed to the loader as the location of the .mat files.
COLAB_FILE_PATH = '/content/drive/MyDrive/NASA_Battery_Data'

Loading The Data:

In [43]:
def load_nasa_mat_file(battery_name, file_path='.'):
    """Parse ``<file_path>/<battery_name>.mat`` into one long-format DataFrame.

    The file is expected to hold a variable named ``battery_name`` whose
    ``[0, 0]['cycle'][0]`` entry is an array of cycle structs, each with a
    ``type`` field and a ``data`` struct.

    Parameters
    ----------
    battery_name : str
        Name of the .mat file (without extension) and of the variable in it.
    file_path : str, default '.'
        Directory that contains the .mat file.

    Returns
    -------
    pandas.DataFrame
        One row per measured sample with columns ``voltage_charger``
        (from ``Voltage_measured``), ``current_charger`` (from
        ``Current_measured``), ``relative_time`` (from ``Time``),
        ``temperature`` (from ``Temperature_measured``), ``cycle_number``
        (1-based position of the cycle in the file, counting skipped cycles),
        ``mode`` (1 = charge, -1 = discharge, 0 = anything else) and
        ``battery_name`` (with ``'B00'`` shortened to ``'B'``).
        An empty DataFrame is returned when the file is missing, the variable
        is absent or not shaped as expected, or no cycle carries measurements.
    """
    mat_file_path = os.path.join(file_path, f"{battery_name}.mat")
    print(f"Loading {mat_file_path}...")

    try:
        mat_file = scipy.io.loadmat(mat_file_path)
    except FileNotFoundError:
        print(f"Error: {battery_name}.mat not found in {file_path}")
        print("Please ensure the .mat files are in the specified directory.")
        return pd.DataFrame()

    try:
        # Access the nested struct: mat_file['B0005'][0, 0]['cycle'][0]
        cycle_array = mat_file[battery_name][0, 0]['cycle'][0]
    except KeyError:
        print(f"Error: Could not find key '{battery_name}' in the .mat file.")
        return pd.DataFrame()
    except (IndexError, ValueError, TypeError) as exc:
        # The variable exists but is not the expected nested struct (e.g. a
        # plain numeric array or a struct without a 'cycle' field). Fail the
        # same soft way as the other error paths instead of crashing.
        print(f"Error: Unexpected structure under '{battery_name}' in the .mat file ({exc}).")
        return pd.DataFrame()

    mode_codes = {'charge': 1, 'discharge': -1}
    all_cycle_data_list = []

    for cycle_index, cycle in enumerate(cycle_array, start=1):
        # 'type' arrives wrapped in (possibly nested) arrays; unwrap to the first scalar.
        mode_value = cycle['type']
        while isinstance(mode_value, np.ndarray):
            mode_value = mode_value.flat[0]
        mode = str(mode_value)

        data_struct = cycle['data'][0, 0]

        # Skip cycles whose data struct carries no voltage measurements.
        # dtype.names is None for non-structured data, so normalise to a tuple.
        field_names = data_struct.dtype.names or ()
        if 'Voltage_measured' not in field_names:
            continue

        df_cycle = pd.DataFrame({
            'voltage_charger': data_struct['Voltage_measured'].flatten(),
            'current_charger': data_struct['Current_measured'].flatten(),
            'relative_time': data_struct['Time'].flatten(),
            'temperature': data_struct['Temperature_measured'].flatten()
        })

        df_cycle['cycle_number'] = cycle_index
        df_cycle['mode'] = mode_codes.get(mode, 0)

        all_cycle_data_list.append(df_cycle)

    if not all_cycle_data_list:
        print(f"Warning: No valid cycle data found for {battery_name}.")
        return pd.DataFrame()

    final_df = pd.concat(all_cycle_data_list, ignore_index=True)
    final_df['battery_name'] = battery_name.replace('B00', 'B')

    print(f"Successfully loaded and parsed {battery_name}. {final_df.shape[0]} data points.")
    return final_df

In [44]:

# Parse the three battery files in a fixed order, keeping one frame per battery.
df_B5_raw, df_B7_raw, df_B18_raw = (
    load_nasa_mat_file(name, file_path=COLAB_FILE_PATH)
    for name in ('B0005', 'B0007', 'B0018')
)

# Stack the per-battery frames into a single frame with a fresh 0..n-1 index.
df_raw = pd.concat((df_B5_raw, df_B7_raw, df_B18_raw), ignore_index=True)

print(f"\nLoaded {len(df_raw)} total raw data points.")
print("-" * 50)

Loading /content/drive/MyDrive/NASA_Battery_Data/B0005.mat...
Successfully loaded and parsed B0005. 591458 data points.
Loading /content/drive/MyDrive/NASA_Battery_Data/B0007.mat...
Successfully loaded and parsed B0007. 591458 data points.
Loading /content/drive/MyDrive/NASA_Battery_Data/B0018.mat...
Successfully loaded and parsed B0018. 314676 data points.

Loaded 1497592 total raw data points.
--------------------------------------------------


In [45]:
# Show the first rows of the combined raw frame as a quick sanity check.
df_raw.head()

Unnamed: 0,voltage_charger,current_charger,relative_time,temperature,cycle_number,mode,battery_name
0,3.873017,-0.001201,0.0,24.655358,1,1,B05
1,3.479394,-4.030268,2.532,24.66648,1,1,B05
2,4.000588,1.512731,5.5,24.675394,1,1,B05
3,4.012395,1.509063,8.344,24.693865,1,1,B05
4,4.019708,1.511318,11.125,24.705069,1,1,B05


In [46]:
# Count missing values per column.
df_raw.isnull().sum()

Unnamed: 0,0
voltage_charger,2
current_charger,2
relative_time,0
temperature,2
cycle_number,0
mode,0
battery_name,0


In [52]:
# List the distinct mode codes present in the loaded data, then re-show the first rows.
print(df_raw['mode'].unique())
print(df_raw.head())

[ 1 -1]
   voltage_charger  current_charger  relative_time  temperature  cycle_number  \
0         3.873017        -0.001201          0.000    24.655358             1   
1         3.479394        -4.030268          2.532    24.666480             1   
2         4.000588         1.512731          5.500    24.675394             1   
3         4.012395         1.509063          8.344    24.693865             1   
4         4.019708         1.511318         11.125    24.705069             1   

   mode battery_name  
0     1          B05  
1     1          B05  
2     1          B05  
3     1          B05  
4     1          B05  


FEATURE ENGINEERING (dQ/dV Calculation):



In [54]:
# Progress message marking the start of the feature-engineering cell.
print("Starting dQ/dV feature engineering...")

def calculate_dqdv(cycle_data, V_start=3.0, V_end=4.2, num_points=200):
    """Compute a smoothed dQ/dV curve from the charge rows of a single cycle.

    Steps: keep rows with ``mode == 1``; integrate ``current * dt / 3600``
    cumulatively **in time order** to get Q; express Q as a function of
    voltage; linearly interpolate Q(V) onto a uniform voltage grid (values
    outside the measured voltage range are linearly extrapolated); and take
    the first derivative with a Savitzky-Golay filter.

    Parameters
    ----------
    cycle_data : pandas.DataFrame
        Needs columns ``mode``, ``voltage_charger``, ``current_charger`` and
        ``relative_time``. The division by 3600 presumes ``relative_time`` is
        in seconds, so that Q comes out in ampere-hours -- confirm against the
        data source.
    V_start, V_end : float
        Bounds of the uniform voltage grid.
    num_points : int
        Number of grid points, i.e. the length of the returned vector.

    Returns
    -------
    numpy.ndarray
        dQ/dV sampled on the grid, shape ``(num_points,)``. All-NaN when there
        are no usable charge rows or fewer than 10 distinct voltage values.
    """
    # Filter for the charge phase and discard samples that cannot be integrated.
    charge_data = cycle_data[cycle_data['mode'] == 1]
    charge_data = charge_data.dropna(
        subset=['voltage_charger', 'current_charger', 'relative_time']
    )
    if charge_data.empty:
        return np.full(num_points, np.nan)

    # Accumulate charge in TIME order. Sorting by voltage first (as was done
    # previously) scrambles consecutive samples wherever voltage is not
    # monotonic in time, producing negative time steps and wrong Q values.
    charge_data = charge_data.sort_values(by='relative_time', kind='mergesort')
    time_diff = charge_data['relative_time'].diff().fillna(0)
    charge_data = charge_data.assign(
        Q=(charge_data['current_charger'] * time_diff / 3600).cumsum()
    )

    # Re-express Q as a function of voltage: order by voltage (stable, so ties
    # keep their time order) and keep the earliest sample for each voltage.
    df_clean = (
        charge_data
        .sort_values(by='voltage_charger', kind='mergesort')
        .drop_duplicates(subset=['voltage_charger'], keep='first')
    )
    V = df_clean['voltage_charger'].values
    Q = df_clean['Q'].values

    MIN_POINTS = 10
    if len(V) < MIN_POINTS:
        return np.full(num_points, np.nan)

    # Interpolate Q(V) onto a uniform grid.
    V_grid = np.linspace(V_start, V_end, num_points)
    Q_interp_func = interp1d(V, Q, kind='linear', bounds_error=False, fill_value="extrapolate")
    Q_interp = Q_interp_func(V_grid)

    # Compute dQ/dV as the Savitzky-Golay smoothed first derivative.
    window_length = 5
    polyorder = 3

    if np.isnan(Q_interp).all():
        return np.full(num_points, np.nan)

    Q_interp = np.nan_to_num(Q_interp, nan=0)
    dQdV = savgol_filter(Q_interp, window_length, polyorder, deriv=1, delta=V_grid[1] - V_grid[0])

    return dQdV

def calculate_target_capacity(cycle_data):
    """Return the discharged capacity of one cycle, or NaN if it is unusable.

    Only rows with ``mode == -1`` are considered. The capacity is the negated
    sum of ``current_charger * dt / 3600`` over those rows, where ``dt`` is the
    row-to-row difference in ``relative_time`` (0 for the first row). Values
    that are not strictly greater than 1.0 are reported as NaN, as is a cycle
    without any discharge rows.
    """
    discharge_rows = cycle_data.loc[cycle_data['mode'] == -1]
    if discharge_rows.empty:
        return np.nan

    step_durations = discharge_rows['relative_time'].diff().fillna(0)
    signed_charge = (discharge_rows['current_charger'] * step_durations / 3600).sum()
    capacity = signed_charge * -1

    # Anything at or below the 1.0 threshold is treated as an invalid measurement.
    return capacity if capacity > 1.0 else np.nan

# Accumulators for the final feature table, plus a tally of why cycles were dropped.
all_features_list = []
all_cycle_info_list = []
rejection_reasons = {'Bad dQ/dV': 0, 'Bad Target': 0, 'Empty Discharge': 0, 'Other': 0}

# Intermediate per-cycle results, keyed by (battery_name, cycle_number).
battery_cycle_features = {}
battery_cycle_capacity = {}

grouped = df_raw.groupby(['battery_name', 'cycle_number', 'mode'])

# Pass 1: compute a dQ/dV vector for every charge group and a capacity for
# every discharge group, independently of each other.
for (battery_name, cycle_num, mode), cycle_data in grouped:
    cycle_key = (battery_name, cycle_num)
    try:
        if mode == 1:  # Charge cycle
            dQdV_vector = calculate_dqdv(cycle_data)
            if np.isnan(dQdV_vector).all():
                rejection_reasons['Bad dQ/dV'] += 1
            else:
                battery_cycle_features[cycle_key] = dQdV_vector

        elif mode == -1:  # Discharge cycle
            target_capacity = calculate_target_capacity(cycle_data)
            if np.isnan(target_capacity) or not (target_capacity > 0):
                rejection_reasons['Bad Target'] += 1
            else:
                battery_cycle_capacity[cycle_key] = target_capacity

    except Exception:
        # Best effort: count the failure and move on to the next group.
        rejection_reasons['Other'] += 1

# Pass 2: label each charge cycle with the capacity of the nearest discharge of
# the same battery whose cycle number is not smaller than the charge's.
for (battery_name, cycle_num), dQdV_vector in battery_cycle_features.items():
    candidate_keys = [
        key for key in battery_cycle_capacity
        if key[0] == battery_name and key[1] >= cycle_num
    ]
    if not candidate_keys:
        rejection_reasons['Empty Discharge'] += 1
        continue

    nearest_key = min(candidate_keys, key=lambda key: key[1])
    all_features_list.append(dQdV_vector)
    all_cycle_info_list.append({
        'Battery': battery_name,
        'Cycle': cycle_num,
        'Capacity': battery_cycle_capacity[nearest_key]
    })

# Assemble the feature matrix and the combined (info + features) table.
X_dQdV_features = np.array(all_features_list)
df_cycle_info = pd.DataFrame(all_cycle_info_list)
feature_columns = pd.DataFrame(X_dQdV_features)
df_features = pd.concat([df_cycle_info.reset_index(drop=True), feature_columns], axis=1)

total_rejects = sum(rejection_reasons.values())

print("\n--- FEATURE ENGINEERING SUMMARY ---")
print(f"Total cycles rejected: {total_rejects}")
print("Rejection Breakdown:")
print(pd.Series(rejection_reasons))
print(f"Feature engineering complete. {df_features.shape[0]} valid cycles processed.")
print(f"Feature matrix shape (X): {X_dQdV_features.shape}")
print("-" * 50)

Starting dQ/dV feature engineering...

--- FEATURE ENGINEERING SUMMARY ---
Total cycles rejected: 2
Rejection Breakdown:
Bad dQ/dV          2
Bad Target         0
Empty Discharge    0
Other              0
dtype: int64
Feature engineering complete. 472 valid cycles processed.
Feature matrix shape (X): (472, 200)
--------------------------------------------------
