In [1]:
import pandas as pd

def process_data(file_path):
    """Expand the stringified embeddings of a CSV into numeric columns.

    The input CSV must provide two columns:

    * ``Embedding`` -- text such as ``[0.1 0.2 0.3]``: whitespace-separated
      numbers inside square brackets, each optionally wrapped in single
      quotes.
    * ``Label`` -- copied unchanged into the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the input CSV. Both the backslash and the forward slash
        are treated as directory separators when the output file name is
        derived.

    Returns
    -------
    pandas.DataFrame
        Columns ``embedding_0 .. embedding_{k-1}`` followed by ``Label``,
        where ``k`` is the length of the longest embedding in the file.
        Shorter embeddings are right-padded with 0.

    Side effects
    ------------
    Writes the returned frame (without the index) to
    ``cleaned_<input file name>`` in the current working directory.
    """
    # 1. Read the CSV
    df = pd.read_csv(file_path)

    # 2. Parse every embedding string into a list of floats
    def process_embedding(embed_str):
        # Drop the surrounding brackets, then split on whitespace
        tokens = embed_str.strip('[]').split()
        # Strip stray single quotes around each number before converting
        return [float(token.strip("'")) for token in tokens]

    embeddings = df['Embedding'].apply(process_embedding)

    # 3. Expand the lists into columns in a single constructor call.
    #    Ragged rows are padded with NaN by pandas; those gaps become 0.
    #    (After float conversion no cell can be an empty string, so no
    #    separate ""-to-NaN replacement is needed.)
    expanded_df = pd.DataFrame(embeddings.tolist(), index=df.index).fillna(0)

    # 4. Rename the columns
    expanded_df.columns = [f"embedding_{i}" for i in range(expanded_df.shape[1])]

    # 5. Append the 'Label' column from the original DataFrame
    final_df = pd.concat([expanded_df, df['Label']], axis=1)

    # 6. Save next to the notebook as "cleaned_<file name>". Normalising the
    #    separators first makes the base name correct for "/"-style paths too,
    #    instead of producing e.g. "cleaned_dir/file.csv".
    file_name = str(file_path).replace("\\", "/").rsplit("/", 1)[-1]
    cleaned_path = "cleaned_" + file_name
    final_df.to_csv(cleaned_path, index=False)
    return final_df

# Function to pad dataframes to have the same number of columns
def pad_dataframe(df1, df2):
    """Give two DataFrames the same number of columns by zero-padding.

    The frame with fewer columns receives extra all-zero columns named
    ``padding_<i>`` until both column counts match. Names that already exist
    in that frame are skipped, so padding a frame that was padded before adds
    new columns instead of overwriting the earlier ones.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to equalise. The narrower one is modified IN PLACE.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same two objects that were passed in, in the same order.

    Notes
    -----
    Padding columns are appended after all existing columns and only the
    column *count* is equalised; column names and order are not aligned
    between the two frames.
    """
    width_gap = df1.shape[1] - df2.shape[1]
    if width_gap == 0:
        # Already the same width: nothing to add
        return df1, df2

    # A positive gap means df2 is the narrower frame, otherwise df1 is
    narrower = df2 if width_gap > 0 else df1
    still_missing = abs(width_gap)

    taken = set(narrower.columns)
    suffix = 0
    while still_missing:
        candidate = f"padding_{suffix}"
        suffix += 1
        if candidate in taken:
            # Assigning to an existing name would overwrite that column
            # rather than widen the frame, so move on to the next suffix
            continue
        narrower[candidate] = 0
        still_missing -= 1

    return df1, df2

# Attack capture: clean the raw CSV, then preview the first rows
attack_file_path = "C:\\Users\\willi\\attack_correlated_masquerade_all_w4_off_4.csv"
attack_df = process_data(file_path=attack_file_path)
print("Processed Attack DataFrame:\n", attack_df.head(n=5))

# Benign capture: same cleaning step, same preview
benign_file_path = "C:\\Users\\willi\\benign_all_w10_off_10.csv"
benign_df = process_data(file_path=benign_file_path)
print("\nProcessed Benign DataFrame:\n", benign_df.head(n=5))

# Equalise the column counts of the two frames
attack_df, benign_df = pad_dataframe(df1=attack_df, df2=benign_df)




Processed Attack DataFrame:
    embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
0     0.015923     0.004297     0.022409     0.001141    -0.001638   
1     0.011219    -0.001700     0.017852     0.004617    -0.016653   
2     0.011257    -0.009086     0.018594    -0.011629    -0.008642   
3     0.007918    -0.008054     0.013820     0.007241    -0.009087   
4     0.002480     0.000253     0.016922    -0.003681    -0.001770   

   embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  ...  \
0    -0.016256     0.003681     0.002536    -0.002612    -0.000507  ...   
1    -0.014433     0.011837     0.005016    -0.011273     0.006535  ...   
2    -0.018634     0.008368     0.011641    -0.006298     0.000335  ...   
3    -0.016383     0.008663     0.014826    -0.011623    -0.011860  ...   
4    -0.013876    -0.000698    -0.007030    -0.003385     0.000332  ...   

   embedding_99  embedding_100  embedding_101  embedding_102  embedding_103  \
0           0.0     

In [2]:
# Print both shapes; equal column counts confirm that padding took effect
print("\nAfter Padding:")
for caption, frame in (
    ("Attack DataFrame shape:", attack_df),
    ("Benign DataFrame shape:", benign_df),
):
    print(caption, frame.shape)


After Padding:
Attack DataFrame shape: (27, 109)
Benign DataFrame shape: (2180, 109)
