# This file is used for the gender, age, site based visualizations and results

<h4 style="color:blue">Note: to use these functions, you need to use or import the "MyAudioDataset()" function created in the visualization file. Once you pass a dataset to MyAudioDataset(), it returns the dataset in the format expected by the functions below. Each count function returns pandas DataFrame(s) with the counts for each category.</h4>

In [None]:
import numpy as np
import pandas as pd

### get site based count

In [None]:
def get_SiteCount(dataset):
    """Count vocal fold paralysis (VFP) labels and genders per recording site.

    Args:
        dataset: Object supporting ``len()`` and integer indexing. Each item
            must unpack into exactly six values:
            ``(opensmile_feature, age, gender, site, binned_age, vfp)``.

    Returns:
        A ``(vfp_df, gender_df)`` tuple of pandas DataFrames, both indexed by
        site name ('Mt. Sinai', 'VUMC', 'MIT', 'USF', 'WCM').
        ``vfp_df`` has the columns 'Total', 'VFP' and 'NO VFP';
        ``gender_df`` has the columns 'Total', 'Male', 'Female', 'No Record'
        and 'Non Binary'.

    Notes:
        * Items that do not unpack into six values are reported on stdout and
          skipped.
        * Items whose ``site`` is not one of the codes 0-4 are silently
          ignored.
        * ``vfp == 0.0`` is counted as 'NO VFP'; any other value as 'VFP'.
        * Gender codes 0, 1 and 2 map to 'Male', 'Female' and 'No Record';
          every other value is counted as 'Non Binary'.
    """
    site_names = {0: 'Mt. Sinai', 1: 'VUMC', 2: 'MIT', 3: 'USF', 4: 'WCM'}
    gender_labels = ('Male', 'Female', 'No Record', 'Non Binary')

    # One counter dict per site; insertion order fixes the DataFrame row order.
    vfp_counts = {name: {'VFP': 0, 'NO VFP': 0} for name in site_names.values()}
    gender_counts = {
        name: dict.fromkeys(gender_labels, 0) for name in site_names.values()
    }

    for i in range(len(dataset)):
        try:
            _features, _age, gender, site, _binned_age, vfp = dataset[i]
        except ValueError:
            print(f"Error: dataset.__getitem__({i}) did not return 6 values")
            continue

        if site not in site_names:
            continue
        site_name = site_names[site]

        # Count VFP per site.
        if vfp == 0.0:
            vfp_counts[site_name]['NO VFP'] += 1
        else:
            vfp_counts[site_name]['VFP'] += 1

        # Count gender per site; unknown codes fall through to 'Non Binary'.
        if gender == 0:
            gender_label = 'Male'
        elif gender == 1:
            gender_label = 'Female'
        elif gender == 2:
            gender_label = 'No Record'
        else:
            gender_label = 'Non Binary'
        gender_counts[site_name][gender_label] += 1

    vfp_df = pd.DataFrame(
        {
            "Total": [c['VFP'] + c['NO VFP'] for c in vfp_counts.values()],
            "VFP": [c['VFP'] for c in vfp_counts.values()],
            "NO VFP": [c['NO VFP'] for c in vfp_counts.values()],
        },
        index=list(vfp_counts),
    )

    gender_data = {"Total": [sum(c.values()) for c in gender_counts.values()]}
    for label in gender_labels:
        gender_data[label] = [c[label] for c in gender_counts.values()]
    gender_df = pd.DataFrame(gender_data, index=list(gender_counts))

    # VFP counts and gender counts per site, as two separate DataFrames.
    return vfp_df, gender_df

### get gender based count

In [None]:
def get_GenderCount(dataset):
    """Count vocal fold paralysis (VFP) labels per gender.

    Args:
        dataset: Object supporting ``len()`` and integer indexing. Each item
            must unpack into exactly six values:
            ``(opensmile_feature, age, gender, site, binned_age, vfp)``.

    Returns:
        A pandas DataFrame indexed by gender name ('Male', 'Female',
        'No Record', 'Non Binary') with the columns 'Total', 'VFP' and
        'NO VFP'.

    Notes:
        * Items that do not unpack into six values are reported on stdout and
          skipped.
        * Items whose ``gender`` is not one of the codes 0-3 are silently
          ignored.
        * ``vfp == 0.0`` is counted as 'NO VFP'; any other value as 'VFP'.
    """
    gender_names = {0: 'Male', 1: 'Female', 2: 'No Record', 3: 'Non Binary'}

    # One counter dict per gender; insertion order fixes the row order.
    vfp_counts = {name: {'VFP': 0, 'NO VFP': 0} for name in gender_names.values()}

    for i in range(len(dataset)):
        try:
            _features, _age, gender, _site, _binned_age, vfp = dataset[i]
        except ValueError:
            print(f"Error: dataset.__getitem__({i}) did not return 6 values")
            continue

        if gender not in gender_names:
            continue

        # Count VFP per gender.
        outcome = 'NO VFP' if vfp == 0.0 else 'VFP'
        vfp_counts[gender_names[gender]][outcome] += 1

    vfp_df = pd.DataFrame(
        {
            "Total": [c['VFP'] + c['NO VFP'] for c in vfp_counts.values()],
            "VFP": [c['VFP'] for c in vfp_counts.values()],
            "NO VFP": [c['NO VFP'] for c in vfp_counts.values()],
        },
        index=list(vfp_counts),
    )

    return vfp_df

### get age based count

In [None]:
def get_AgeCount(dataset):
    """Count vocal fold paralysis (VFP) labels per age bin.

    Args:
        dataset: Object supporting ``len()`` and integer indexing. Each item
            must unpack into exactly six values:
            ``(opensmile_feature, age, gender, site, binned_age, vfp)``.

    Returns:
        A pandas DataFrame indexed by age-bin label ('0-20', '21-40',
        '41-60', '61-80', '81-100') with the columns 'Total', 'VFP' and
        'NO VFP'.

    Notes:
        * Items that do not unpack into six values are reported on stdout and
          skipped.
        * Only ``binned_age`` is used for grouping; items whose bin is not
          one of the codes 0-4 are silently ignored.
        * ``vfp == 0.0`` is counted as 'NO VFP'; any other value as 'VFP'.
    """
    age_names = {0: '0-20', 1: '21-40', 2: '41-60', 3: '61-80', 4: '81-100'}

    # One counter dict per age bin; insertion order fixes the row order.
    vfp_counts = {name: {'VFP': 0, 'NO VFP': 0} for name in age_names.values()}

    for i in range(len(dataset)):
        try:
            _features, _age, _gender, _site, binned_age, vfp = dataset[i]
        except ValueError:
            print(f"Error: dataset.__getitem__({i}) did not return 6 values")
            continue

        if binned_age not in age_names:
            continue

        # Count VFP per age bin.
        outcome = 'NO VFP' if vfp == 0.0 else 'VFP'
        vfp_counts[age_names[binned_age]][outcome] += 1

    vfp_df = pd.DataFrame(
        {
            "Total": [c['VFP'] + c['NO VFP'] for c in vfp_counts.values()],
            "VFP": [c['VFP'] for c in vfp_counts.values()],
            "NO VFP": [c['NO VFP'] for c in vfp_counts.values()],
        },
        index=list(vfp_counts),
    )

    return vfp_df

<h4 style="color:blue">Note: to use the code below, you need to use or import the "create_open_smile_df()" function created in the visualization file. It returns 3 variables; the first one, "opensmile_df_test", holds all the opensmile features needed to test the RF_classifier() function created in the visualization folder.</h4>

<h2>Prediction of Random forest classifier based on site, age, gender</h2>

#### Create site-, gender-, and age-based datasets from the dataset returned by "create_open_smile_df()"

In [4]:
# Site split: label for each integer code found in the 'site' column
site_mapping = {
    0: 'Mt. Sinai',
    1: 'VUMC',
    2: 'MIT',
    3: 'USF',
    4: 'WCM',
}

# Group opensmile_df_test by 'site' and key each sub-DataFrame by its label
site_dfs = {
    site_mapping[site_code]: site_group
    for site_code, site_group in opensmile_df_test.groupby('site')
}

# Per-site DataFrames (a lookup raises KeyError if that site has no rows)
mt_sinai_df = site_dfs['Mt. Sinai']
vumc_df = site_dfs['VUMC']
mit_df = site_dfs['MIT']
usf_df = site_dfs['USF']
wcm_df = site_dfs['WCM']

In [None]:
# Gender split: label for each integer code found in the 'gender' column
gender_mapping = {
    0: 'Male',
    1: 'Female',
    2: 'No Record',
    3: 'Non Binary',
}

# Group opensmile_df_test by 'gender' and key each sub-DataFrame by its label
gender_dfs = {
    gender_mapping[gender_code]: gender_group
    for gender_code, gender_group in opensmile_df_test.groupby('gender')
}

# Per-gender DataFrames (a lookup raises KeyError if that gender has no rows)
male_df = gender_dfs['Male']
female_df = gender_dfs['Female']
# The two lookups below are left disabled, as in the original cell
# (presumably these groups are absent from the test split -- TODO confirm).
# no_record_df = gender_dfs['No Record']
# non_binary_df = gender_dfs['Non Binary']

In [None]:
# Age split: label for each integer code found in the 'AGE_bin' column
age_mapping = {
    0: '0-20',
    1: '21-40',
    2: '41-60',
    3: '61-80',
    4: '81-100',
}

# Group opensmile_df_test by 'AGE_bin' and key each sub-DataFrame by its label
age_dfs = {
    age_mapping[bin_code]: bin_group
    for bin_code, bin_group in opensmile_df_test.groupby('AGE_bin')
}

# Per-bin DataFrames (a lookup raises KeyError if that age bin has no rows)
age_0_20_df = age_dfs['0-20']
age_21_40_df = age_dfs['21-40']
age_41_60_df = age_dfs['41-60']
age_61_80_df = age_dfs['61-80']
age_81_100_df = age_dfs['81-100']

#### This function returns the accuracy score of the GAS (gender/age/site) based prediction. It should be used within the main visualization code, not on its own.

In [None]:
def GAS_based_prediction(df):
    """Return the accuracy of ``RF_clf`` on one subgroup DataFrame.

    Relies on names that are not defined in this file and must already exist
    in the calling scope: ``scalar``, ``feature_columns``, ``RF_clf`` and
    ``accuracy_score``.
    NOTE(review): presumably ``scalar`` is the fitted feature scaler,
    ``RF_clf`` the trained random forest classifier and ``accuracy_score``
    comes from scikit-learn -- confirm against the visualization code.

    Args:
        df: DataFrame containing the ``feature_columns`` columns and a
            'vocal_fold_paralysis' label column.

    Returns:
        The value returned by ``accuracy_score(labels, predictions)``.
    """
    # Transform the features with the externally supplied scaler.
    scaled_features = scalar.transform(df[feature_columns])
    true_labels = df['vocal_fold_paralysis']

    # Predict on the transformed features and score against the true labels.
    return accuracy_score(true_labels, RF_clf.predict(scaled_features))