In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import sqlalchemy
from sqlalchemy import create_engine, inspect

import pandas as pd
import tensorflow as tf

## Import datasets

In [2]:
# Import the data
engine = create_engine("sqlite:///voice.sqlite")

# View all of the classes
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['demographic', 'diagnosis', 'habits']

In [3]:
# Initialise a dictionary to hold dataframes
dataframes = dict()

# Loop through each table
for table in table_names:
    
    # Dataframe name
    df_name = f'{table}_df'
    
    # Create dataframe
    dataframes[df_name] = pd.read_sql(
        f'SELECT * FROM {table}',
        engine
    )

In [13]:
dataframes['demographic_df']['id'].value_counts()

id
voice005    2
voice055    2
voice100    1
voice137    1
voice140    1
           ..
voice080    1
voice081    1
voice153    1
voice152    1
voice024    1
Name: count, Length: 206, dtype: int64

In [5]:
dataframes['diagnosis_df']

Unnamed: 0,id,diagnosis,subtype,vhi_score,rsi_score
0,voice100,healthy,no subtype,0,5
1,voice101,healthy,no subtype,80,10
2,voice192,hyperkinetic dysphonia,no subtype,0,10
3,voice193,hyperkinetic dysphonia,no subtype,0,36
4,voice008,reflux laryngitis,no subtype,19,15
...,...,...,...,...,...
203,voice180,healthy,no subtype,0,3
204,voice164,hyperkinetic dysphonia,no subtype,34,1
205,voice165,hypokinetic dysphonia,glottic insufficiency,97,16
206,voice025,healthy,no subtype,16,1


In [6]:
dataframes['habits_df']

Unnamed: 0,id,alcohol_consumption,alcohol_pd,smoker,cigarettes_pd,carbonated_beverages,carbonated_pd,chocolate,chocolate_grams_pd,coffee,coffee_pd,citrus_fruits,citrus_fruits_pd,soft_cheese,soft_cheese_pd,tomatoes,water_litres_pd
0,voice100,casual,0.36,no,0,almost always,3.00,sometimes,30,always,3,never,0.00,almost always,100,never,1.5
1,voice101,nondrinker,0.00,no,0,almost always,3.00,sometimes,30,always,4,never,0.00,sometimes,115,sometimes,1.5
2,voice192,nondrinker,0.00,no,0,never,0.00,always,14,always,3,almost always,1.00,sometimes,115,sometimes,2.5
3,voice193,casual,0.36,yes,15,sometimes,0.61,sometimes,30,always,2,sometimes,1.00,sometimes,115,sometimes,1.0
4,voice008,casual,0.36,no,0,almost never,0.09,almost always,20,always,2,almost always,1.00,sometimes,100,almost always,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,voice180,casual,0.36,no,0,never,0.00,sometimes,30,never,0,never,0.00,sometimes,115,sometimes,1.5
204,voice164,casual,0.36,no,0,sometimes,0.61,sometimes,30,sometimes,2,sometimes,1.00,sometimes,115,sometimes,1.0
205,voice165,casual,0.36,no,0,sometimes,0.61,sometimes,30,sometimes,2,never,0.00,almost never,100,sometimes,1.5
206,voice025,habitual,1.00,no,0,almost never,0.09,sometimes,10,always,1,almost always,1.00,sometimes,5,sometimes,1.5


In [10]:
print(dataframes['demographic_df']['id'].dtype)
print(dataframes['diagnosis_df']['id'].dtype)

object
object


In [12]:
print(dataframes['demographic_df']['id'].value_counts())
print(dataframes['diagnosis_df']['id'].dtype)

id
voice005    2
voice055    2
voice100    1
voice137    1
voice140    1
           ..
voice080    1
voice081    1
voice153    1
voice152    1
voice024    1
Name: count, Length: 206, dtype: int64
object


In [7]:
# Merge dataframes
merged_df = pd.merge(
    dataframes['demographic_df'],
    dataframes['diagnosis_df'],
    how = 'left',
    on = 'id'
)

# merged_df = pd.merge(
#     merged_df,
#     dataframes['habits_df'],
#     # how = 'inner',
#     on = 'id'
# )

# Display merged_df
merged_df

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score
0,voice100,24,m,unknown,healthy,no subtype,0,5
1,voice101,60,m,unknown,healthy,no subtype,80,10
2,voice192,22,m,cook,hyperkinetic dysphonia,no subtype,0,10
3,voice193,46,f,housewife,hyperkinetic dysphonia,no subtype,0,36
4,voice008,51,f,researcher,reflux laryngitis,no subtype,19,15
...,...,...,...,...,...,...,...,...
207,voice180,26,f,student,healthy,no subtype,0,3
208,voice164,65,f,pensioner,hyperkinetic dysphonia,no subtype,34,1
209,voice165,55,m,hairstylist,hypokinetic dysphonia,glottic insufficiency,97,16
210,voice025,45,m,researcher,healthy,no subtype,16,1


In [8]:
merged_df['id'].value_counts(dropna=False)

id
voice005    4
voice055    4
voice100    1
voice137    1
voice140    1
           ..
voice080    1
voice081    1
voice153    1
voice152    1
voice024    1
Name: count, Length: 206, dtype: int64

In [9]:
merged_df.loc[merged_df['id'] == 'voice005']

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score
99,voice005,54,f,researcher,hypokinetic dysphonia,no subtype,39,23
100,voice005,54,f,researcher,hypokinetic dysphonia,no subtype,39,25
163,voice005,54,f,researcher,hypokinetic dysphonia,no subtype,39,23
164,voice005,54,f,researcher,hypokinetic dysphonia,no subtype,39,25
