In [1]:
import pandas as pd
import numpy as np
import glob
import os
from scipy.stats import linregress
from sklearn.decomposition import PCA

In [2]:
# Start by getting the list of frequencies that EIS is measured at, then
# create the lists of column names that the data will be transformed to.
df_forCols = pd.read_csv('./Data/Lithium/EIS_data/EIS_state_I_25C01.txt', sep='\t')

# Only the first 60 rows of the reference file are used.
# NOTE(review): presumably 60 rows == one complete frequency sweep - confirm.
f_vals = df_forCols['freq/Hz'].iloc[:60].tolist()

# Col_names_* are the new column labels: one '<freq> Re' and one '<freq> Im'
# label per measured frequency, in the same order as f_vals.
Col_names_Re = ['{} Re'.format(freq) for freq in f_vals]
Col_names_Im = ['{} Im'.format(freq) for freq in f_vals]


In [3]:
###### Cleans the Capacity Data.  ######
## Load every capacity file into one long dataframe (df_Cap) for use below.

# Get txt files list from the Capacity folder.
# glob returns files in arbitrary order, so sort to keep the row order reproducible.
path = './Data/Lithium/Capacity_data'
txt_files = sorted(glob.glob(os.path.join(path, '*.txt')))
if not txt_files:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError('No capacity .txt files found in ' + path)

# Columns kept from each raw capacity file.
cap_cols = ['time/s', 'cycle number', 'ox/red', 'Capacity/mA.h']

#Generate a list of the individual dataframes
df_list_Cap = []
for file in txt_files:
    # The label is whatever follows 'Data_Capacity_' in the file name, minus the extension.
    fname = os.path.splitext(file.split('Data_Capacity_')[-1])[0]
    df_1 = pd.read_csv(file, sep='\t')
    # .copy() so adding the label column never writes into a view of df_1.
    df_2 = df_1[cap_cols].copy()
    #Adding a label column with a value based on the file name
    df_2['label'] = str(fname)
    df_list_Cap.append(df_2)

#Concatenate all of the dataframes together into one large dataframe
# (a single concat call instead of re-copying the growing frame on every iteration).
df_Cap = pd.concat(df_list_Cap, axis=0, ignore_index=True)

#Check if their published data is only for 0??
# Keep only the rows whose 'ox/red' flag is 0.
# NOTE(review): presumably these are the discharge steps - confirm against the dataset description.
sub_df = df_Cap[df_Cap['ox/red'] == 0]

#Generates the dataframe with the max capacity for each label and cycle number.
# groupby sorts its keys, so the result is ordered by label and then by cycle number.
grouped_data = sub_df.groupby(['label', 'cycle number'])
max_Cap_grouped = grouped_data['Capacity/mA.h'].max().reset_index()

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of max_Cap_grouped (avoids SettingWithCopyWarning).
df = max_Cap_grouped[['label', 'cycle number', 'Capacity/mA.h']].copy()

#rename cycle 0 in capacity data to cycle 1 so that there is a 1-1 correspondance in EIS data.
df['cycle number'] = (df['cycle number'] + 1).astype('int64')
U_labels = list(df['label'].unique())

#Normalize capacity based on first cycle for each label.
# first_cap: capacity of the first row of every label (the lowest cycle, given the sort above).
first_cap = df.drop_duplicates('label').set_index('label')['Capacity/mA.h']

# A zero or missing reference capacity cannot be normalized against. Report and
# drop those labels explicitly rather than hiding the problem behind a bare except.
unusable = first_cap.index[first_cap.isna() | (first_cap == 0)]
for label in unusable:
    print('Skipping', label, '- first-cycle capacity is zero or missing, cannot normalize.')
usable_df = df[~df['label'].isin(unusable)]

# Vectorized: divide every capacity by the first-cycle capacity of its own label.
norm_df = usable_df.assign(
    Norm_Cap=usable_df['Capacity/mA.h'] / usable_df['label'].map(first_cap)
).reset_index(drop=True)

    
#Generate Cycles to Fail Column and add it to the dataframe
# For every label the failure cycle is the first cycle whose normalized capacity
# is <= 0.8; 'Cycles Until Fail' is that failure cycle minus the row's cycle number.
U_labels = list(norm_df['label'].unique())
CtF_cols = ['label', 'cycle number', 'Capacity/mA.h', 'Norm_Cap', 'Cycles Until Fail']
CtF_parts = []
for label in U_labels:
    selected_rows = norm_df[(norm_df['label'] == label)]
    failed_rows = selected_rows[selected_rows['Norm_Cap'] <= 0.8]
    if len(failed_rows) == 0:
        print('Failure Cycle not Reached for',label,'Slope Method Used.')
        #if the battery does not reach the failure point of 0.8, we have to extrapolate
        #For some of the data, linregress gives an extrapolated value which is much too high
        #These batteries with extrapolated failure points should not be used to train regression.
        y = selected_rows['Norm_Cap'].to_numpy(dtype=float)
        # x stays float32, as before, so extrapolated values do not shift between runs.
        x = selected_rows['cycle number'].to_numpy().astype(np.float32)

        # linregress cannot fit a line when all x values are identical.
        if np.unique(x).size < 2:
            print('Cannot extrapolate a failure cycle for', label,
                  '- fewer than two distinct cycles. Label skipped.')
            continue

        #linear regression on the full set of x,y data points.
        fit = linregress(x, y)
        # A flat, rising or non-finite fit never crosses 0.8 in the future, and
        # int() of inf/nan would raise, so there is no usable failure cycle.
        if not (np.isfinite(fit.slope) and np.isfinite(fit.intercept)) or fit.slope >= 0:
            print('Cannot extrapolate a failure cycle for', label,
                  '- capacity is not decreasing. Label skipped.')
            continue
        Fail_C = (.8 - fit.intercept) / fit.slope
    else:
        # First recorded cycle at or below the 0.8 threshold.
        Fail_C = failed_rows.iloc[0]['cycle number']

    Fail_C = int(Fail_C)
    # Vectorized subtraction; the new column is appended after the existing ones.
    tempdf = selected_rows.assign(**{'Cycles Until Fail': Fail_C - selected_rows['cycle number']})
    CtF_parts.append(tempdf)

# One concat at the end instead of growing the frame (and concatenating onto an
# empty frame) on every iteration.
if CtF_parts:
    CtF_df = pd.concat(CtF_parts, ignore_index=True)
else:
    CtF_df = pd.DataFrame(columns=CtF_cols)

Failure Cycle not Reached for 25C04 Slope Method Used.


In [4]:
###### Cleans the data and creates set of EIS and PCA csv's for each state of charge.  ######

#Create new folder for the cleaned data if it does not already exist.
# exist_ok=True makes this a single race-free call (no separate exists() check).
os.makedirs('./Cleaned_Data', exist_ok=True)

#List of all of the named state of charges.
SoC_lst = ['I','II','III','IV','V','VI','VII','VIII','IX']

# Get txt files list from EIS folder.
# glob returns files in arbitrary order, so sort to keep processing order reproducible.
path = './Data/Lithium/EIS_data'
txt_files = sorted(glob.glob(path + "/*.txt"))

# Layout of the reshaped ("square") frame: the two identifier columns followed by
# one Re column and one Im column per measured frequency.
Col_names = ['label','cycle number']+Col_names_Re+Col_names_Im

for SoC in SoC_lst:
    # Match the '_<SoC>_' tag against the file name only, so directory names
    # can never cause a false hit.
    SoC_tag = '_' + SoC + '_'
    SoC_files = [file for file in txt_files if SoC_tag in os.path.basename(file)]
    if not SoC_files:
        # Nothing to process for this state; skip it instead of failing on df_list[0].
        print(str('No EIS files found for state ' + SoC + ', skipping.'))
        continue

    #Generate a list of the individual dataframes
    df_list = []
    for file in SoC_files:
        # The label is whatever follows 'EIS_state_<SoC>_' in the file name, minus the extension.
        fname = os.path.splitext(file.split(str('EIS_state_'+SoC+'_'))[-1])[0]
        #The published data has headers for some data files. Other have no header. Need to check.
        df_temp = pd.read_csv(file, sep='\t', header=None)
        #Remove the first row if it was a header. The check is if the first cell starts with t (time).
        # Note: when a header row was present, the remaining values are strings, not floats.
        if str(df_temp.iloc[0, 0]).strip().startswith('t'):
            # .copy() so the column assignments below do not write into a slice.
            df_temp = df_temp.iloc[1:].copy()
        #Adding this line because the labels have random spaces in front or after. Standardizing column names.
        df_temp.columns = ['time/s', 'cycle number', 'freq/Hz', 'Re(Z)/Ohm', '-Im(Z)/Ohm', '|Z|/Ohm', 'Phase(Z)/deg']
        #Adding a label column with a value based on the file name
        df_temp['label'] = str(fname)
        df_list.append(df_temp)

    #Concatenate all of the dataframes together into one large dataframe
    df_EIS = pd.concat(df_list, axis=0, ignore_index=True)

    #Here I am changing the format of the dataframe so that each impedance response at each frequency has its own column
    # Rows are collected in a list and turned into a frame once, rather than
    # appended to the frame one at a time.
    U_labels = list(df_EIS['label'].unique())
    sq_rows = []
    for label in U_labels:
        label_df = df_EIS[(df_EIS['label'] == label)]
        for cycle, selected_rows in label_df.groupby('cycle number', sort=False):
            try:
                # Cycle values may be strings such as '  329.00000'.
                cycle_int = int(float(cycle))
            except (TypeError, ValueError):
                print('Error at',SoC,label,' cycle: ',cycle)
                continue
            Re_data = selected_rows['Re(Z)/Ohm'].tolist()
            Im_data = selected_rows['-Im(Z)/Ohm'].tolist()
            Imp_data = [label, cycle_int] + Re_data + Im_data
            # A cycle whose number of measurements does not match the expected
            # frequency list cannot be placed in the fixed column layout.
            if len(Imp_data) != len(Col_names):
                print('Error at',SoC,label,' cycle: ',cycle)
                print('Missing EIS Frequencies in Data for this cycle.')
                continue
            sq_rows.append(Imp_data)
    sq_df = pd.DataFrame(sq_rows, columns=Col_names)

    #Take only the data where there is overlap with the capacity data.
    new_df2 = CtF_df.merge(sq_df, on=['label','cycle number'], how='inner')
    if len(new_df2) < 2:
        # A 2-component PCA needs at least two samples.
        print(str('Not enough overlapping capacity/EIS rows for ' + SoC + ', skipping.'))
        continue

    #Get the EIS part of the df only so that we can do PCA.
    # Cast explicitly to float: the values can be strings when the source file had a header row.
    EIS_Only_df = new_df2.drop(
        ['label','cycle number','Capacity/mA.h','Norm_Cap','Cycles Until Fail'], axis=1
    ).astype(np.float64)

    #Define the PCA
    #We do not need to use a standard scalar here - all are measurements of impedance response.
    #Take only the two most important dimensions
    pca = PCA(2)
    #Perform the PCA fit and transform the features into that reduced dimensional space
    principalComponents = pca.fit_transform(EIS_Only_df)

    #Put it into a pandas df, aligned on new_df2's index so the join below is row-for-row.
    Reduced_df = pd.DataFrame(
        data=principalComponents,
        columns=['EIS_PCA_1','EIS_PCA_2'],
        index=new_df2.index,
    )

    Full_df = new_df2.join(Reduced_df)
    Full_df.to_csv(str('./Cleaned_Data/'+SoC+'_Full_Data_wPCA.csv'), index=False)

    print(str('Finished with '+SoC))
    print(Full_df.shape)


Error at I 25C05  cycle:   329.00000
Missing EIS Frequencies in Data for this cycle.
Finished with I
(2741, 127)
Finished with II
(2600, 127)
Finished with III
(2599, 127)
Error at IV 25C05  cycle:   276.00000
Missing EIS Frequencies in Data for this cycle.
Finished with IV
(2597, 127)
Finished with V
(2597, 127)
Finished with VI
(1308, 127)
Finished with VII
(424, 127)
Finished with VIII
(424, 127)
Finished with IX
(2737, 127)
