# Battery Degradation Analysis of NASA Li-On Batteries

This data-set was cleaned and pre-processed to a certain extent on the Kaggle website. The analysis below intends to pick up further from the cleaned data to conduct further cleaning and exploratory data analysis.

## Load MetaData

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import os
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

data_folder = '/Users/aparnakakarlapudi/Desktop/Practicum/cleaned_dataset'
meta_data = pd.read_csv(os.path.join(data_folder, "metadata.csv"))

In [50]:
meta_data

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct
0,discharge,2010-07-21 15:00:35,4,B0047,0,1,00001.csv,1.6743047446975208,,
1,impedance,2010-07-21 16:53:45,24,B0047,1,2,00002.csv,,0.05605783343888099,0.20097016584458333
2,charge,2010-07-21 17:25:40,4,B0047,2,3,00003.csv,,,
3,impedance,2010-07-21 20:31:05,24,B0047,3,4,00004.csv,,0.05319185850921101,0.16473399914864734
4,discharge,2010-07-21 21:02:56,4,B0047,4,5,00005.csv,1.5243662105099023,,
...,...,...,...,...,...,...,...,...,...,...
7560,impedance,2010-09-30 07:36:45,24,B0055,247,7561,07561.csv,,0.0968087979207628,0.15489738203707232
7561,discharge,2010-09-30 08:08:36,4,B0055,248,7562,07562.csv,1.0201379996149256,,
7562,charge,2010-09-30 08:48:54,4,B0055,249,7563,07563.csv,,,
7563,discharge,2010-09-30 11:50:17,4,B0055,250,7564,07564.csv,0.9907591663373165,,


In [51]:
actual_data = pd.read_csv(os.path.join(data_folder, "data/00002.csv"))
actual_data

Unnamed: 0,Sense_current,Battery_current,Current_ratio,Battery_impedance,Rectified_Impedance
0,(928.3472290039062-48.499576568603516j),(228.7861328125-70.94481658935547j),(3.76172584438583+0.9544956717274271j),(0.19021741554080737+0.07913959666077047j),(0.17493022756754967-0.02331644173631698j)
1,(922.0059814453125-52.15481185913086j),(233.29783630371094-67.84492492675781j),(3.70383574337748+0.8535511912426126j),(0.012482111540534805+0.03664975726191017j),(0.16866854345060991-0.024717137242854174j)
2,(920.0405883789062-52.19926452636719j),(236.62075805664062-67.22105407714844j),(3.6558702379340415+0.8179848126025503j),(0.030461237041852884-0.08529386061204824j),(0.16107808875912483-0.024114907529065532j)
3,(914.2005615234375-52.371726989746094j),(242.2973175048828-66.45936584472656j),(3.564190338159781+0.7614698525522733j),(0.14229365598447172-0.12192216177062407j),(0.14924381603096853-0.02372905366581296j)
4,(911.2781372070312-45.477928161621094j),(246.62347412109375-65.4150161743164j),(3.4978434135716947+0.7433743116536402j),(0.24388219819020493-0.08033187245076714j),(0.14049811021506395-0.022851601640846698j)
5,(907.116943359375-46.32453536987305j),(251.11593627929688-65.69110107421875j),(3.426140647581949+0.7117924049476518j),(0.303109266599245-0.01100015291136107j),(0.1311573351226623-0.02242361032794129j)
6,(899.554443359375-43.06085968017578j),(259.12646484375-67.00004577636719j),(3.2942234027295925+0.6855813018072434j),(0.32709058487432013+0.05145705088260462j),(0.12304603255413803-0.0195099331278606j)
7,(891.207275390625-43.172672271728516j),(266.564697265625-68.21566772460938j),(3.176714891409828+0.6509828833170116j),(0.33089967640944123+0.09355670635190765j),(0.11344810854459421-0.017644032709853628j)
8,(885.2266845703125-43.83500289916992j),(273.4709777832031-69.10552978515625j),(3.080782312186481+0.6182158425280606j),(0.3254359075820158+0.11409318785479901j),(0.10803339194768323-0.01638562897701378j)
9,(877.4107666015625-43.043678283691406j),(281.1755065917969-67.8630599975586j),(2.9836528086978147+0.5670342101181585j),(0.31603270085729296+0.1168731107735024j),(0.10034982474252357-0.013454518333835989j)


## Clean the data

In [6]:
data = '/Users/aparnakakarlapudi/Desktop/Practicum/cleaned_dataset/data'
data_cleaned = '/Users/aparnakakarlapudi/Desktop/Practicum/cleaned_dataset/data_cleaned'
data_files = os.listdir(data)

## Feature Engineering

In [19]:
impedance_data = meta_data[meta_data['type'] == 'impedance']
discharge_data = meta_data[meta_data['type'] == 'discharge']
charge_data = meta_data[meta_data['type'] == 'charge']
impedance_data

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct,id
1,impedance,[2010. 7. 21. 16. 53. ...,24,B0047,1,2,00002.csv,,0.05605783343888099,0.20097016584458333,00002
3,impedance,[2010 7 21 20 31 5],24,B0047,3,4,00004.csv,,0.05319185850921101,0.16473399914864734,00004
13,impedance,[2010. 7. 22. 17. 3. ...,24,B0047,13,14,00014.csv,,0.05963791501051059,0.21039872263834902,00014
15,impedance,[2010. 7. 22. 20. 40. 25.5],24,B0047,15,16,00016.csv,,0.05512505361624278,0.1754882075917004,00016
17,impedance,[2010. 7. 23. 11. 35. ...,24,B0047,17,18,00018.csv,,0.058878485312444453,0.19095687096090014,00018
...,...,...,...,...,...,...,...,...,...,...,...
7536,impedance,[2010. 9. 28. 16. 5. ...,24,B0055,223,7537,07537.csv,,0.09747109901247247,0.15760912100719615,07537
7546,impedance,[2010. 9. 29. 8. 16. ...,24,B0055,233,7547,07547.csv,,0.0982550032520246,0.14877104776241656,07547
7548,impedance,[2010. 9. 29. 11. 53. ...,24,B0055,235,7549,07549.csv,,0.09778722154159296,0.15088524149658153,07549
7558,impedance,[2010. 9. 30. 3. 59. ...,24,B0055,245,7559,07559.csv,,0.09867936363638956,0.15274663175569908,07559


In [20]:
impedance_files = impedance_data['filename'].values
discharge_files = discharge_data['filename'].values
charge_files = charge_data['filename'].values
impedance_files

array(['00002.csv', '00004.csv', '00014.csv', ..., '07549.csv',
       '07559.csv', '07561.csv'], dtype=object)

In [45]:
def concat_files(files,data_folder):
    random_number_generator = np.random.RandomState(0)
    selected_dfs = []
    for file in tqdm(files):
        df = pd.read_csv(os.path.join(data_folder, "data", file))
        row = random_number_generator.randint(len(df)//2, len(df))
        time = df.iloc[row]['Time']
        rul = df.iloc[-1]['Time'] - time

        agg_features = ['mean','std']
        aggs = df.iloc[:row].agg(agg_features).to_dict()
        features = df.iloc[row].to_dict()
        features['id'] = f"{file.split('.')[0]}"
        features['RUL'] = rul
        for k, v in aggs.items():
            if k == "Time":
                continue   
            for kk, vv in v.items():
                features[f"{k}_{kk}"] = vv
        selected_dfs.append(pd.DataFrame([features]))

    df = pd.concat(selected_dfs)
    print(f'Number of rows: {len(df)}')
    df = df.dropna()
    print(f'Number of rows after dropping nan: {len(df)}')

    return df

discharge_df = concat_files(discharge_files,data_folder)
charge_df = concat_files(charge_files,data_folder)

100%|██████████| 2794/2794 [00:23<00:00, 118.61it/s]


Number of rows: 2794
Number of rows after dropping nan: 2794


100%|██████████| 2815/2815 [00:27<00:00, 100.88it/s]


Number of rows: 2815
Number of rows after dropping nan: 2815


In [52]:
impedance_files = impedance_data['filename'].values
def process_impedance_files(impedance_files, data_folder):
    selected_dfs = []
    for file in tqdm(impedance_files):
        df = pd.read_csv(os.path.join(data_folder, "data", file))
        # Directly parse complex numbers without string manipulation
        df['Sense_current'] = df['Sense_current'].apply(lambda x: complex(x))
        df['Battery_current'] = df['Battery_current'].apply(lambda x: complex(x))
        df['Current_ratio'] = df['Current_ratio'].apply(lambda x: complex(x))
        df['Battery_impedance'] = df['Battery_impedance'].apply(lambda x: complex(x))
        df['Rectified_impedance'] = df['Rectified_Impedance'].apply(lambda x: complex(x))
        
        # Calculate magnitude for the complex impedance features
        df['Re_magnitude'] = df['Battery_impedance'].apply(lambda x: np.abs(x))
        df['Rct_magnitude'] = df['Rectified_impedance'].apply(lambda x: np.abs(x))
        
        # Extract id from filename
        features = {
            'id': file.split('.')[0],
            'Re_magnitude': df['Re_magnitude'].mean(),
            'Rct_magnitude': df['Rct_magnitude'].mean()
        }
        
        selected_dfs.append(pd.DataFrame([features]))

    impedance_df = pd.concat(selected_dfs, ignore_index=True)
    return impedance_df

# Example usage
impedance_df = process_impedance_files(impedance_files, data_folder)

100%|██████████| 1956/1956 [00:12<00:00, 162.83it/s]


In [53]:
impedance_df
#discharge_df
#charge_df

Unnamed: 0,id,Re_magnitude,Rct_magnitude
0,00002,0.213929,0.084121
1,00004,0.217082,0.084427
2,00014,0.222013,0.089369
3,00016,0.221105,0.088679
4,00018,0.219014,0.087305
...,...,...,...
1951,07537,0.274595,0.125688
1952,07547,0.272369,0.122961
1953,07549,0.273414,0.123927
1954,07559,0.274221,0.124881


In [43]:
charge_df['cycle_type'] = 0
discharge_df['cycle_type'] = 1

# Now combine them
cycle_data = pd.concat([charge_df, discharge_df], ignore_index=True)

# Optional: Drop unnecessary columns if present
columns_to_keep = ['id', 'RUL', 'cycle_type', 'Voltage_measured_mean', 'Voltage_measured_std',
                   'Current_measured_mean', 'Current_measured_std', 'Temperature_measured_mean',
                   'Temperature_measured_std', 'Current_load_mean', 'Current_load_std',
                   'Voltage_load_mean', 'Voltage_load_std', 'Current_charge_mean', 
                   'Current_charge_std', 'Voltage_charge_mean', 'Voltage_charge_std']

cycle_data = cycle_data[columns_to_keep].dropna()

# Ensure 'id' in cycle_data matches the format in impedance_df and meta_data for successful merging
cycle_data['id'] = cycle_data['id'].apply(lambda x: x.zfill(5))  # Adjusts '1' to '00001', if necessary


In [39]:
# Strip '.csv' from 'filename' in meta_data to match 'id' in impedance_df
meta_data['id'] = meta_data['filename'].str.replace('.csv', '', regex=False)

# Now merge using the 'id' column
full_data = pd.merge(impedance_data, impedance_df, on='id', how='left')

# Proceed with the rest of your data processing and modeling

# Merge impedance data with meta_data using 'filename' or another appropriate key
full_data = pd.merge(impedance_data, impedance_df, on='id', how='left')

# Assuming 'cycle_data' is a DataFrame with aggregated features from charge/discharge cycles
# Ensure cycle_data has an 'id' column for merging
full_data = pd.merge(full_data, cycle_data, on='id', how='left')

# Select features and target variable for model training
# Ensure RUL is present in full_data
feature_columns = [col for col in full_data.columns if col not in ['RUL', 'id', 'battery_id', 'test_id', 'uid', 'filename', 'type', 'start_time', 'ambient_temperature', 'Capacity']]
X = full_data[feature_columns]
y = full_data['RUL']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the full feature set
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'MAE: {mae}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  impedance_data['id'] = impedance_data['filename'].str.replace('.csv', '', regex=False)


ValueError: could not convert string to float: '(0.08241814902922012-0.022224149442637422j)'

In [44]:
cycle_data

Unnamed: 0,id,RUL,cycle_type,Voltage_measured_mean,Voltage_measured_std,Current_measured_mean,Current_measured_std,Temperature_measured_mean,Temperature_measured_std,Current_load_mean,Current_load_std,Voltage_load_mean,Voltage_load_std,Current_charge_mean,Current_charge_std,Voltage_charge_mean,Voltage_charge_std
