In [1]:
# Import dependencies
import sqlalchemy
from sqlalchemy import create_engine, inspect

import pandas as pd
import os

## Import datasets

In [2]:
# Location of the SQLite database, relative to the notebook's working directory
DB_PATH = "voice.sqlite"

# SQLite creates a new, empty database when the file is missing, which would
# only surface later as a KeyError on the expected tables -- fail early instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# Connect to the database
engine = create_engine(f"sqlite:///{DB_PATH}")

# List the tables available in the database
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['aval',
 'bval',
 'demographic',
 'diagnosis',
 'gval',
 'habits',
 'rval',
 'spectrogram']

In [3]:
# Read every table in full into its own dataframe, keyed as '<table>_df'
dataframes = {
    f'{table}_df': pd.read_sql(f'SELECT * FROM {table}', engine)
    for table in table_names
}

In [4]:
# Inner-join the demographic, diagnosis and habits tables on their shared
# 'id' column, so only ids present in all three tables are kept
merged_df = (
    dataframes['demographic_df']
    .merge(dataframes['diagnosis_df'], how='inner', on='id')
    .merge(dataframes['habits_df'], how='inner', on='id')
)

# Preview the first rows of the combined dataframe
merged_df.head()

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score,alcohol_consumption,alcohol_pd,...,chocolate,chocolate_grams_pd,coffee,coffee_pd,citrus_fruits,citrus_fruits_pd,soft_cheese,soft_cheese_pd,tomatoes,water_litres_pd
0,voice100,24,m,unknown,healthy,no subtype,0,5,casual,0.36,...,sometimes,30,always,3,never,0.0,almost always,100,never,1.5
1,voice101,60,m,unknown,healthy,no subtype,80,10,nondrinker,0.0,...,sometimes,30,always,4,never,0.0,sometimes,100,sometimes,1.5
2,voice192,22,m,cook,hyperkinetic dysphonia,no subtype,0,10,nondrinker,0.0,...,always,14,always,3,almost always,1.17,sometimes,100,sometimes,2.5
3,voice193,46,f,housewife,hyperkinetic dysphonia,no subtype,0,36,casual,0.36,...,sometimes,30,always,2,sometimes,1.0,sometimes,100,sometimes,1.0
4,voice008,51,f,researcher,reflux laryngitis,no subtype,19,15,casual,0.36,...,almost always,20,always,2,almost always,1.0,sometimes,100,almost always,1.0


## Visualisations

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the rsi_score column
merged_df.loc[:, 'rsi_score'].describe()

count    204.000000
mean      11.848039
std        8.281468
min        0.000000
25%        5.000000
50%       12.000000
75%       18.000000
max       41.000000
Name: rsi_score, dtype: float64