In [None]:
# Setup for multiple output display and warning filtering.
# `warnings` is imported here because this cell runs before the main import
# cell; without it the filterwarnings call below raises NameError.
import warnings

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'
# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import os
import warnings
import pandas as pd
from sklearn.utils import shuffle

# Read the Individual Data Files and Concatenate Them into a Single CSV File

In [None]:
# Source CSV files, one per ink/solvent, in the order they are concatenated.
source_files = [
    "IPA.csv",
    "Methoxy_Ethanol.csv",
    "silver_ink_10_.csv",
    "silver_ink_20_.csv",
    "silver_ink_30_.csv",
    "TGME2.csv",
    "EG_0_50.csv",
    "Toluene.csv",
    "Hexanol.csv",
    "Graphene_Oxide.csv",
]

# Load every file, keeping the individual frames available under df1..df10.
dataFrame = [pd.read_csv(csv_path) for csv_path in source_files]
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = dataFrame

# Stack all frames into one and randomise the row order.
df = pd.concat(dataFrame)
df_shuffle = shuffle(df)

# Write the shuffled, combined data to disk without the index column.
df_shuffle.to_csv("complete_dataset.csv", index=False)

In [None]:
def renameFile(old_name, new_name):
    """
    Rename a file, overwriting the destination if it already exists.

    The destination is replaced as part of the rename itself (os.replace)
    instead of being deleted beforehand, so a failed rename -- for example
    when the source file is missing -- leaves an existing destination intact.
    Errors are reported on stdout rather than raised.

    Parameters:
    old_name (str): The current name of the file to be renamed.
    new_name (str): The new name for the file.

    Returns:
    None
    """
    try:
        # os.replace overwrites an existing destination file in one step.
        os.replace(old_name, new_name)
    except FileNotFoundError:
        print(f"Error: File '{old_name}' not found.")
    except OSError as e:
        print(f"An error occurred: {e}")
    else:
        print(f"File '{old_name}' successfully renamed as '{new_name}'.")

def removeFile(file_path):
    """
    Delete a single file, reporting the outcome on stdout.

    A missing file or any other OS-level failure is printed instead of
    being raised to the caller.

    Parameters:
    file_path (str): The path to the file to be removed.

    Returns:
    None
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except OSError as err:
        print(f"An error occurred: {err}")
    else:
        # Only reached when the deletion succeeded.
        print(f"File '{file_path}' successfully removed.")

def func_class_count(class_counts):
    """
    Add up the per-class counts stored in a pandas Series.

    Every count is converted with int() before it is added, so a fractional
    count (such as a count scaled by a split ratio) contributes only its
    whole-number part.

    Parameters:
    class_counts (pandas.Series): Class labels as the index, counts as values.

    Returns:
    int: The sum of the integer-truncated class counts.
    """
    # Go through a dict, as before, then truncate and sum in a single pass.
    return sum(int(value) for value in class_counts.to_dict().values())

def extract_and_save_rows(input_file, value_to_match, col_index, num_counts):
    """
    Split a CSV file into rows matching a column value and the remaining rows.

    The first `num_counts` rows (in file order) whose value in column
    `col_index` equals `value_to_match` are written to '<value_to_match>.csv'.
    Every other row is written to 'remaining_data.csv'. Both files are always
    written, to the current working directory; if fewer than `num_counts`
    rows match, all matching rows are extracted.

    Parameters:
    input_file (str): The path to the input CSV file.
    value_to_match (str): The value to match in the specified column.
    col_index (int): The positional index of the column to match against.
    num_counts (int): The maximum number of matching rows to extract.

    Returns:
    None
    """
    df = pd.read_csv(input_file)

    # Select the column by position so the lookup does not depend on labels.
    is_match = (df.iloc[:, col_index] == value_to_match).to_numpy()

    # A negative or zero request extracts nothing.
    limit = max(int(num_counts), 0)
    matched_index = df.index[is_match][:limit]

    # Slicing the frame (rather than appending row by row) keeps column
    # dtypes and avoids DataFrame.append, which pandas 2.0 removed.
    matched_rows = df.loc[matched_index]
    remaining_rows = df.drop(index=matched_index)

    matched_rows.to_csv(f"{value_to_match}.csv", index=False)
    remaining_rows.to_csv('remaining_data.csv', index=False)

# Calculate class counts in the Jettable column

In [None]:
input_file = "complete_dataset.csv"
df = pd.read_csv(input_file)

# Fraction of each class held out for testing: 0.2 = 20% test / 80% train
ratio = 0.2

# Number of samples per class across the whole dataset
class_counts_all = df['Jettable'].value_counts()
print("class count [overall]: ")
print(class_counts_all)

# Per-class test allocation (may be fractional until truncated later)
class_counts_test = class_counts_all * ratio
print("class count [test]: ")
print(class_counts_test)

# Summarise the split sizes; the train size is whatever is not held out
total_samples = func_class_count(class_counts_all)
test_samples = func_class_count(class_counts_test)
print('Total: ', total_samples)
print('Train: ', total_samples - test_samples)
print('Test: ', test_samples)

# Extract data from CSV based on class counts in Jettable column

In [None]:
input_file1 = input_file
input_file2 = "remaining_data.csv"

# Positional index of the Jettable column
col_index = df.columns.get_loc('Jettable')

# Per-class test counts as plain integers, keyed by class label
class_counts_dict = {
    class_label: int(count)
    for class_label, count in class_counts_test.to_dict().items()
}

# The first extraction reads the complete dataset; the later ones read
# remaining_data.csv, the file extract_and_save_rows writes its leftover
# rows to, so each step works on what the previous step left behind.
extraction_plan = (
    (input_file1, 'Multiple Drop'),
    (input_file2, "Single Drop"),
    (input_file2, "No Ejection"),
)
for source_file, label in extraction_plan:
    extract_and_save_rows(source_file, label, col_index, class_counts_dict.get(label, 0))

# Combine the data files to respective training and testing files

In [None]:
# Load the per-class extracts produced by the previous step
df1 = pd.read_csv('Multiple Drop.csv')
df2 = pd.read_csv('Single Drop.csv')
df3 = pd.read_csv('No Ejection.csv')

# Combine the three class extracts into the test set
frames = [df1, df2, df3]
df_test = pd.concat(frames)

# Shuffle the data so the classes are not grouped together
df_test = shuffle(df_test)

# Save the combined testing data to a single CSV file
df_test.to_csv('test_dataset.csv', index=False)

# Whatever was not extracted becomes the training set
renameFile('remaining_data.csv','train_dataset.csv')

# Clean up the intermediate per-class files
removeFile(file_path='Single Drop.csv')
removeFile(file_path='No Ejection.csv')
removeFile(file_path='Multiple Drop.csv')