# Imports

In [5]:
# importing all the necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import os

# Files and folders

In [6]:
# Root folder of the cleaned dataset
cleaned_data = "./cleaned_dataset"
# Sub-folder holding the experiment CSV files
data = cleaned_data + "/data"
# CSV file with the metadata
metadata_path = cleaned_data + "/metadata.csv"


# Useful Functions

In [32]:
def extract_real_imaginary(imp):
    """Split a complex-impedance string into its real and imaginary parts.

    The first character and the last two characters of ``imp`` are discarded
    before parsing, so the value is expected to be wrapped like ``"(a+bj)"``
    or ``"(a-bj)"`` (presumably the text form of a Python complex number --
    confirm against the dataset). Either component may be written in
    scientific notation such as ``1.2e-05``.

    Args:
        imp: Text of the complex impedance value.

    Returns:
        A ``(real, imaginary)`` tuple of floats, each returned as the negative
        of its absolute value.

    Raises:
        ValueError: If no sign separating the two components is found, or if
            a component is not a valid float.
    """
    # Drop the wrapping characters (one in front, two at the end).
    body = imp[1:-2]

    # The separator is the last '+' or '-' that is neither the leading sign
    # of the real part (index 0) nor the sign of an exponent ("e-05").
    split_at = None
    for pos in range(len(body) - 1, 0, -1):
        if body[pos] in "+-" and body[pos - 1] not in "eE":
            split_at = pos
            break
    if split_at is None:
        raise ValueError(f"cannot split impedance value {imp!r} into real and imaginary parts")

    real = float(body[:split_at])
    # The sign stays attached to the imaginary part; float() accepts it.
    imaginary = float(body[split_at:])

    return -abs(real), -abs(imaginary)

def convert_to_timestamp(s):
    """Convert ``"[YYYY MM DD HH MM SS.sss]"`` text into a ``pd.Timestamp``.

    The six whitespace-separated fields may each be written as a float
    (e.g. ``"2008."``); the first five are truncated to integers and the last
    one carries the fractional seconds.

    Args:
        s: Timestamp text, optionally surrounded by square brackets.

    Returns:
        The corresponding ``pd.Timestamp`` with the seconds rounded to the
        nearest microsecond.

    Raises:
        ValueError: If ``s`` does not hold exactly six numeric fields.
    """
    fields = s.strip().strip("[]").split()
    if len(fields) != 6:
        raise ValueError(f"expected 6 date/time fields, got {len(fields)}: {s!r}")

    year, month, day, hour, minute = (int(float(field)) for field in fields[:5])

    # Round instead of truncating: float error can leave the product just
    # below the intended integer (16.4 * 1e6 == 16399999.999999998), which
    # truncation would turn into a lost microsecond.
    total_microseconds = round(float(fields[5]) * 1_000_000)

    # Adding the seconds as a Timedelta lets a value that rounds up to a full
    # minute carry over instead of producing an invalid second/microsecond.
    start_of_minute = pd.Timestamp(year=year, month=month, day=day,
                                   hour=hour, minute=minute)
    return start_of_minute + pd.Timedelta(microseconds=total_microseconds)


def load_impedance_data(filename):
    """Load one impedance CSV and return its real/imaginary components.

    Reads ``{data}/{filename}``, drops rows whose ``Rectified_Impedance`` is
    missing, parses the remaining values with ``extract_real_imaginary`` and
    sorts the result by the real component.

    Args:
        filename: Name of the CSV file inside the ``data`` folder.

    Returns:
        A new DataFrame with float columns ``Re_Z`` and ``Im_Z``, sorted by
        ``Re_Z`` and indexed by the original row labels. It is an independent
        frame (not a slice of the loaded CSV), so callers can add columns to
        it without triggering pandas' SettingWithCopyWarning.
    """
    column_name = "Rectified_Impedance"

    raw = pd.read_csv(f"{data}/{filename}")
    values = raw[column_name].dropna()

    # Parse every value once into a (real, imaginary) tuple; building the
    # frame from the list avoids creating one pd.Series per row and also
    # works when no rows are left after dropping missing values.
    parsed = [extract_real_imaginary(value) for value in values]
    impedance = pd.DataFrame(parsed, columns=["Re_Z", "Im_Z"],
                             index=values.index, dtype=float)

    return impedance.sort_values(by="Re_Z")


def get_task1_data(metadata):
    """Build (or reload) the combined impedance table for task 1.

    If ``./extracted_data/task1.csv`` already exists it is read and returned
    as-is, without looking at ``metadata``. Otherwise the impedance file of
    every metadata row is loaded, tagged with that row's ``battery_id`` and
    ``cycle_count``, concatenated, restricted to ``-0.3 <= Re_Z <= 0.3`` and
    ``-0.2 <= Im_Z <= 0.2``, written to the CSV path and returned.

    Args:
        metadata: DataFrame with ``filename``, ``battery_id`` and
            ``cycle_count`` columns (unused when the cached CSV exists).

    Returns:
        DataFrame with columns ``Re_Z``, ``Im_Z``, ``battery_id`` and
        ``cycle_count``. When ``metadata`` has no rows an empty frame with
        those columns is returned and nothing is written to disk.
    """
    path = "./extracted_data/task1.csv"

    # Reuse the previously extracted data when it is available.
    if os.path.isfile(path):
        return pd.read_csv(path)

    frames = []
    for _, row in metadata.iterrows():
        # assign() returns a new frame, so the loaded data is never modified
        # in place (no SettingWithCopyWarning).
        frames.append(
            load_impedance_data(row["filename"]).assign(
                battery_id=row["battery_id"],
                cycle_count=row["cycle_count"],
            )
        )

    # pd.concat raises on an empty list; return an empty table instead and do
    # not cache it, so a later call with real metadata still recomputes.
    if not frames:
        return pd.DataFrame(columns=["Re_Z", "Im_Z", "battery_id", "cycle_count"])

    task1 = pd.concat(frames, ignore_index=True)

    # Keep only points inside the (inclusive) impedance window.
    in_window = task1["Re_Z"].between(-0.3, 0.3) & task1["Im_Z"].between(-0.2, 0.2)
    task1 = task1[in_window]

    # The output folder may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    task1.to_csv(path, index=False)

    return task1

def plot_task1_data(df):
    """Display an interactive 3D scatter plot of the impedance data.

    ``Re_Z`` and ``Im_Z`` are drawn on the x and y axes, ``cycle_count`` on
    the z axis, and points are coloured by ``battery_id``.
    """
    axis_labels = {
        'Re_Z': 'Real Impedance',
        'Im_Z': 'Imaginary Impedance',
        'cycle_count': 'Cycle Count',
    }
    scatter_options = dict(
        x='Re_Z',
        y='Im_Z',
        z='cycle_count',
        color='battery_id',  # one colour per battery
        title='Interactive 3D Impedance Plot',
        labels=axis_labels,
    )
    figure = px.scatter_3d(df, **scatter_options)

    # Optional (disabled): look at the scene from above.
    # figure.update_layout(scene_camera=dict(eye=dict(x=0, y=0, z=2)))

    figure.show()

# Metadata

In [34]:
# Load the metadata table
metadata = pd.read_csv(metadata_path)
# Keep only the impedance rows; copy() makes the result an independent frame so
# the column assignments below do not raise SettingWithCopyWarning
metadata = metadata[metadata["type"] == "impedance"].copy()
# Convert start_time from its bracketed text form to pandas timestamps
metadata["start_time"] = metadata["start_time"].apply(convert_to_timestamp)
# Order the rows by battery, then chronologically within each battery
metadata.sort_values(by=["battery_id", "start_time"], inplace=True)
# Number each battery's rows 1, 2, 3, ... in that chronological order
metadata["cycle_count"] = metadata.groupby("battery_id").cumcount() + 1
metadata.head()

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct,cycle_count
5160,impedance,2008-04-18 20:55:29.859,24,B0005,40,5161,05161.csv,,0.0446687003661609,0.0694562730453699,1
5162,impedance,2008-04-18 22:39:16.312,24,B0005,42,5163,05163.csv,,0.0466870016248693,0.0762747409853058,2
5164,impedance,2008-04-19 02:14:27.015,24,B0005,44,5165,05165.csv,,0.044843430573346,0.0679720560130687,3
5166,impedance,2008-04-19 03:57:24.187,24,B0005,46,5167,05167.csv,,0.0461946895509937,0.0745338891843796,4
5168,impedance,2008-04-19 07:32:33.656,24,B0005,48,5169,05169.csv,,0.04510114851143,0.0685283289600032,5


In [35]:
# Build the task-1 impedance table from the metadata (get_task1_data reuses
# ./extracted_data/task1.csv when that file already exists)
task1 = get_task1_data(metadata)
# Show the table as an interactive 3D scatter plot
plot_task1_data(task1)