# Exploring Keyboard Data

1. Load the data
    - Create a DataFrame from the metadata files in every user's data folder
    - Additional features: user name, audio filename, keyboard material, and microphone
2. Clean up fields whose values are unknown
    - Impute values from online sources, or set a generic value.


End result: One dataframe of information

Cluster data

Visualize those clusters

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import yaml

In [2]:
# Root folder holding one sub-folder per user. Built with os.path.join so the
# path separator is correct on every OS; the trailing '' keeps the trailing
# separator on the resulting string.
data_folder = os.path.join('..', 'data', '')


def extract_key_pressed(file_name):
    """Return the key label embedded in a ``key_press_<key>_<timestamp>.yaml`` name.

    Everything between the ``key_press_`` prefix and the final
    ``_<timestamp>`` part is treated as the key, so a key that is itself an
    underscore comes back as ``'_'`` rather than an empty string.

    Raises:
        IndexError: if the name has fewer than three ``_``-separated parts.
    """
    stem = os.path.splitext(file_name)[0]
    parts = stem.split('_')
    if len(parts) > 3:
        # Re-join the middle parts: the key itself may contain underscores.
        return '_'.join(parts[2:-1])
    return parts[2]


def read_key_metadata(yaml_file_path):
    """Parse one key-press YAML file into a dict, leaving out its ``key_pressed`` entry.

    Lines from the one starting with ``key_pressed:`` up to (but not
    including) the one starting with ``keyboard_name:`` are removed before
    parsing. Presumably that entry is not plain YAML that ``safe_load`` can
    read -- TODO confirm. The key is recovered from the file name instead.

    Returns:
        The parsed mapping, or an empty dict when nothing is left to parse.
    """
    with open(yaml_file_path, 'r') as file:
        lines = file.readlines()

    filtered_lines = []
    skip = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith('key_pressed:'):
            skip = True
        elif skip and stripped.startswith('keyboard_name:'):
            skip = False
        if not skip:
            filtered_lines.append(line)

    # safe_load returns None for empty input; fall back to a dict so the
    # caller can always add its own fields.
    return yaml.safe_load(''.join(filtered_lines)) or {}


# One dict per key-press recording.
key_data_list = []
# NOTE: nothing below fills this list yet, so sentence_df ends up empty.
sentence_data_list = []

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for user_folder in sorted(os.listdir(data_folder)):
    user_folder_path = os.path.join(data_folder, user_folder)
    if not os.path.isdir(user_folder_path):
        continue

    key_folder_path = os.path.join(user_folder_path, 'data', 'key_data')
    # Sentence recordings are not read yet; the path is only computed here.
    sentence_folder_path = os.path.join(user_folder_path, 'data', 'sentence_data')

    # A user folder without key data is skipped instead of aborting the load.
    if not os.path.isdir(key_folder_path):
        continue

    for file_name in sorted(os.listdir(key_folder_path)):
        if not file_name.endswith('.yaml'):
            continue

        yaml_file_path = os.path.join(key_folder_path, file_name)
        yaml_data = read_key_metadata(yaml_file_path)

        # Add the user, audio file name, and key_pressed information
        yaml_data['user'] = user_folder
        yaml_data['audio_file'] = file_name.replace('.yaml', '.wav')  # Assuming audio files are in .wav format
        yaml_data['key_pressed'] = extract_key_pressed(file_name)

        key_data_list.append(yaml_data)

# Convert the lists of dictionaries to DataFrames
key_df = pd.DataFrame(key_data_list)
sentence_df = pd.DataFrame(sentence_data_list)

# Display the first rows of the key-press DataFrame
key_df.head()

Unnamed: 0,default_high_input_latency,default_high_output_latency,default_low_input_latency,default_low_output_latency,default_samplerate,hostapi,index,keyboard_name,keyboard_size,keyboard_type,max_input_channels,max_output_channels,name,switch_color,timestamp,user,audio_file,key_pressed
0,0.18,0.18,0.09,0.09,44100.0,0,1,DropCTRLV1,80%_Tenkeyless,mechanical,2,0,Mic/Inst (Samson G-Track Pro),Halo True,1720383986,Basem,"key_press_,_1720383986.wav",","
1,0.18,0.18,0.09,0.09,44100.0,0,1,DropCTRLV1,80%_Tenkeyless,mechanical,2,0,Mic/Inst (Samson G-Track Pro),Halo True,1720383989,Basem,"key_press_,_1720383989.wav",","
2,0.18,0.18,0.09,0.09,44100.0,0,1,DropCTRLV1,80%_Tenkeyless,mechanical,2,0,Mic/Inst (Samson G-Track Pro),Halo True,1720383993,Basem,"key_press_,_1720383993.wav",","
3,0.18,0.18,0.09,0.09,44100.0,0,1,DropCTRLV1,80%_Tenkeyless,mechanical,2,0,Mic/Inst (Samson G-Track Pro),Halo True,1720383996,Basem,"key_press_,_1720383996.wav",","
4,0.18,0.18,0.09,0.09,44100.0,0,1,DropCTRLV1,80%_Tenkeyless,mechanical,2,0,Mic/Inst (Samson G-Track Pro),Halo True,1720384000,Basem,"key_press_,_1720384000.wav",","


## Add Keyboard Material Column

In [3]:
# Lookup from keyboard model name to the material stored in 'keyboard_material'.
material_mapper = {'DropCTRLV1': 'aluminum',
                   'Keychron Q3': 'aluminum'}

# Guarded so the cell can be re-run: once 'keyboard_name' has been replaced
# there is nothing left to map.
if 'keyboard_name' in key_df.columns:
    # Names missing from the mapper become NaN in the new column; report them
    # so the mapper can be extended instead of failing silently.
    unmapped = key_df.loc[
        ~key_df['keyboard_name'].isin(list(material_mapper)), 'keyboard_name'
    ].unique()
    if len(unmapped):
        print('No material mapping for keyboard(s):', list(unmapped))

    # Rebind rather than mutate in place; the new column is appended last and
    # the source column is removed.
    key_df = (
        key_df
        .assign(keyboard_material=key_df['keyboard_name'].map(material_mapper))
        .drop(columns='keyboard_name')
    )

## Add Cleaned-Up Microphone Column

In [4]:
# Lookup from the raw device name in 'name' to a short microphone label.
microphone_mapper = {'Mic/Inst (Samson G-Track Pro)': 'Samson_GTrack_Pro',
                     'Microphone (3- USB PnP Audio De': 'GenericPnP'}

# Guarded so the cell can be re-run: once 'name' has been replaced there is
# nothing left to map.
if 'name' in key_df.columns:
    # Device names missing from the mapper become NaN in the new column;
    # report them so the mapper can be extended instead of failing silently.
    unmapped = key_df.loc[
        ~key_df['name'].isin(list(microphone_mapper)), 'name'
    ].unique()
    if len(unmapped):
        print('No microphone mapping for device name(s):', list(unmapped))

    # Rebind rather than mutate in place; the new column is appended last and
    # the source column is removed.
    key_df = (
        key_df
        .assign(microphone=key_df['name'].map(microphone_mapper))
        .drop(columns='name')
    )

## Drop unneeded columns

In [5]:
# Columns this notebook treats as unneeded and removes from key_df.
unnecessary_cols = ['default_high_input_latency',
                    'default_high_output_latency',
                    'default_low_input_latency',
                    'default_low_output_latency',
                    'default_samplerate',
                    'hostapi',
                    'index',
                    'max_input_channels',
                    'max_output_channels',
                    'timestamp',]

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
key_df = key_df.drop(columns=unnecessary_cols, errors='ignore')

In [6]:
# Preview the first five rows of the cleaned key-press DataFrame
key_df.head()

Unnamed: 0,keyboard_size,keyboard_type,switch_color,user,audio_file,key_pressed,keyboard_material,microphone
0,80%_Tenkeyless,mechanical,Halo True,Basem,"key_press_,_1720383986.wav",",",aluminum,Samson_GTrack_Pro
1,80%_Tenkeyless,mechanical,Halo True,Basem,"key_press_,_1720383989.wav",",",aluminum,Samson_GTrack_Pro
2,80%_Tenkeyless,mechanical,Halo True,Basem,"key_press_,_1720383993.wav",",",aluminum,Samson_GTrack_Pro
3,80%_Tenkeyless,mechanical,Halo True,Basem,"key_press_,_1720383996.wav",",",aluminum,Samson_GTrack_Pro
4,80%_Tenkeyless,mechanical,Halo True,Basem,"key_press_,_1720384000.wav",",",aluminum,Samson_GTrack_Pro
