# Substitute Speaker given a Key-Value Table

## Preparation

### Import

In [1]:
import pandas as pd
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Function Definition

In [2]:
### Future Improvement check every document to create the table first

### Substitute each Speaker with his StudentID

In [3]:
def substitute_values_in_files(input_folder_path, excel_path, output_folder_path):
    # Load the Excel file into a Pandas dataframe
    df = pd.read_excel(excel_path)

    # Create an empty dictionary
    my_dict = {}

    # Iterate over the rows of the DataFrame
    for i in range(df.shape[0]):
        # Get the values of the row
        values = df.iloc[i].values
        # Create the dictionary key
        key = f"{values[0]}_dia_trans.csv"
        sub_dict_key = str(values[1])
        sub_dict_value = str(values[2])
        # If the key already exists in the dictionary, append the sub-dictionary to it
        if key in my_dict:
            my_dict[key][sub_dict_key] = sub_dict_value
        # Otherwise, create a new sub-dictionary with the current key-value pair
        else:
            my_dict[key] = {sub_dict_key: sub_dict_value}

    # Initialize a counter
    counter = 1

    # Loop through all the files in the input folder
    for filename in os.listdir(input_folder_path):
        if filename in my_dict:
            # Open the input file
            with open(os.path.join(input_folder_path, filename), 'r') as f:
                file_contents = f.read()
            # Substitute the value in the file
            for k, v in my_dict[filename].items():
                file_contents = file_contents.replace(k, v)
            # Save the modified file in the output folder
            output_file_path = os.path.join(output_folder_path, str(filename[:-4] + "_users.csv"))
            with open(output_file_path, 'w') as f:
                f.write(file_contents)
            # Print the name of the processed file
            print(f"Processed file {counter}: {filename}")
            counter += 1

In [4]:
def join_csv_files(directory, output_filename):
    # Get a list of all CSV files in the specified directory
    csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

    # Initialize an empty list to store DataFrames
    dfs = []

    # Read and concatenate each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(directory, csv_file)
        df = pd.read_csv(file_path)
        dfs.append(df)

    # Concatenate all DataFrames into one
    concatenated_df = pd.concat(dfs, ignore_index=True)

    # Save the concatenated DataFrame to a CSV file with the specified name
    output_path = os.path.join(directory, output_filename)
    concatenated_df.to_csv(output_path, index=False)

## Use of Function

In [5]:
input_folder_path = r'/content/drive/MyDrive/Projects/tps/meetings/data/7. diarization_transcription'
excel_path = r'/content/drive/MyDrive/Projects/tps/meetings/data/0. support_files/speaker_match.xlsx'
output_folder_path = r'/content/drive/MyDrive/Projects/tps/meetings/data/8. dia_trans_verified_users'

In [6]:
substitute_values_in_files(input_folder_path, excel_path, output_folder_path)

Processed file 1: 1.1_dia_trans.csv
Processed file 2: 12.1_dia_trans.csv
Processed file 3: 10.2_dia_trans.csv
Processed file 4: 1.2_dia_trans.csv
Processed file 5: 2.2_dia_trans.csv
Processed file 6: 4.1_dia_trans.csv
Processed file 7: 10.1_dia_trans.csv
Processed file 8: 2.1_dia_trans.csv
Processed file 9: 11.1_dia_trans.csv
Processed file 10: 11.2_dia_trans.csv
Processed file 11: 12.2_dia_trans.csv
Processed file 12: 4.2_dia_trans.csv
Processed file 13: 6.1_dia_trans.csv
Processed file 14: 5.2_dia_trans.csv
Processed file 15: 7.1_dia_trans.csv
Processed file 16: 4.3_dia_trans.csv
Processed file 17: 7.2_dia_trans.csv
Processed file 18: 5.1_dia_trans.csv
Processed file 19: 7.4_dia_trans.csv
Processed file 20: 7.5_dia_trans.csv
Processed file 21: 9.2_dia_trans.csv
Processed file 22: 9.1_dia_trans.csv
Processed file 23: 3.1_dia_trans.csv
Processed file 24: 3.3_dia_trans.csv
Processed file 25: 7.3_dia_trans.csv
Processed file 26: 8.2_dia_trans.csv
Processed file 27: 8.1_dia_trans.csv
Proc

In [7]:
output_file_name = 'dia_trans_combined.csv'
join_csv_files(output_folder_path, output_file_name)