# Data Cleaning

In [40]:
# Import dependencies
import os
import pandas as pd
import numpy as np

## Read in Files

In [5]:
# Raw dataset file path
rawdata_path = "resources/voiced_dataset/"

# Get all the files in the directory
files = os.listdir(rawdata_path)

In [61]:
# Initialise the list to hold dictionaries
metadata_list = []

# Parse the data in the info files
for file in files:
    
    # Initialise the dictionary to store the info
    metadata_dict = dict()
    
    # Look through info files only
    if file.startswith("voice") and file.endswith("-info.txt"):
        
        # Read the text file
        with open(rawdata_path + file, 'r') as file:
            for line in file:
                
                # Split each line into a key-value pair using delimiter
                key, value = map(str.strip, line.split("\t"))
                
                # Ignore the empty lines by checking whitespaces
                if not line.strip():
                    continue
                else:
                    # Remove the colon
                    key = key.replace(":", "")
                    
                    # Load the data to a dictionary
                    metadata_dict[key] = value
                    
            # Append the dictionary to the list
            metadata_list.append(metadata_dict)

# Convert the list to a DataFrame
metadata_df = pd.DataFrame(metadata_list)
metadata_df.head()

Unnamed: 0,ID,Age,Gender,Diagnosis,Occupation status,Voice Handicap Index (VHI) Score,Reflux Symptom Index (RSI) Score,Smoker,Number of cigarettes smoked per day,Alcohol consumption,...,Amount of glasses drinked in a day,Tomatoes,Coffee,Number of cups of coffee drinked in a day,Chocolate,Gramme of chocolate eaten in a day,Soft cheese,Gramme of soft cheese eaten in a day,Citrus fruits,Number of citrus fruits eaten in a day
0,voice100,24,m,healthy,NU,0,5,no,NU,casual drinker,...,NU,never,always,3,sometimes,NU,almost always,NU,never,NU
1,voice101,60,m,healthy,NU,80,10,no,NU,nondrinker,...,NU,sometimes,always,4,sometimes,NU,sometimes,NU,never,NU
2,voice192,22,m,hyperkinetic dysphonia,Cook,0,10,no,NU,nondrinker,...,NU,sometimes,always,NU,always,NU,sometimes,NU,almost always,NU
3,voice193,46,f,hyperkinetic dysphonia,Housewife,0,36,yes,15,casual drinker,...,NU,sometimes,always,2,sometimes,NU,sometimes,NU,sometimes,NU
4,voice008,51,f,reflux laryngitis,Researcher,19,15,no,NU,casual drinker,...,NU,almost always,always,2,almost always,20g,sometimes,100 gr,almost always,1


## Clean metadata_df

In [62]:
# Check the dataframe
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 24 columns):
 #   Column                                                            Non-Null Count  Dtype 
---  ------                                                            --------------  ----- 
 0   ID                                                                208 non-null    object
 1   Age                                                               208 non-null    object
 2   Gender                                                            208 non-null    object
 3   Diagnosis                                                         208 non-null    object
 4   Occupation status                                                 208 non-null    object
 5   Voice Handicap Index (VHI) Score                                  208 non-null    object
 6   Reflux Symptom Index (RSI) Score                                  208 non-null    object
 7   Smoker                                      

### Simplify the column names

In [63]:
# Print the columns
original_cols = list(metadata_df.columns)
original_cols

['ID',
 'Age',
 'Gender',
 'Diagnosis',
 'Occupation status',
 'Voice Handicap Index (VHI) Score',
 'Reflux Symptom Index (RSI) Score',
 'Smoker',
 'Number of cigarettes smoked per day',
 'Alcohol consumption',
 'Number of glasses containing alcoholic beverage drinked in a day',
 "Amount of water's litres drink every day",
 'Eating habits',
 'Carbonated beverages',
 'Amount of glasses drinked in a day',
 'Tomatoes',
 'Coffee',
 'Number of cups of coffee drinked in a day',
 'Chocolate',
 'Gramme of chocolate eaten in  a day',
 'Soft cheese',
 'Gramme of soft cheese eaten in a day',
 'Citrus fruits',
 'Number of citrus fruits eaten in a day']

In [64]:
# Convert the column names to lower case
lowercase_cols = [col_name.lower() for col_name in original_cols]

# Create a copy of metadata_df
renamed_df = metadata_df.copy()

# Update the column names
renamed_df.columns = lowercase_cols

# Display the DataFrame
renamed_df.head()

Unnamed: 0,id,age,gender,diagnosis,occupation status,voice handicap index (vhi) score,reflux symptom index (rsi) score,smoker,number of cigarettes smoked per day,alcohol consumption,...,amount of glasses drinked in a day,tomatoes,coffee,number of cups of coffee drinked in a day,chocolate,gramme of chocolate eaten in a day,soft cheese,gramme of soft cheese eaten in a day,citrus fruits,number of citrus fruits eaten in a day
0,voice100,24,m,healthy,NU,0,5,no,NU,casual drinker,...,NU,never,always,3,sometimes,NU,almost always,NU,never,NU
1,voice101,60,m,healthy,NU,80,10,no,NU,nondrinker,...,NU,sometimes,always,4,sometimes,NU,sometimes,NU,never,NU
2,voice192,22,m,hyperkinetic dysphonia,Cook,0,10,no,NU,nondrinker,...,NU,sometimes,always,NU,always,NU,sometimes,NU,almost always,NU
3,voice193,46,f,hyperkinetic dysphonia,Housewife,0,36,yes,15,casual drinker,...,NU,sometimes,always,2,sometimes,NU,sometimes,NU,sometimes,NU
4,voice008,51,f,reflux laryngitis,Researcher,19,15,no,NU,casual drinker,...,NU,almost always,always,2,almost always,20g,sometimes,100 gr,almost always,1


In [65]:
# Print updated columns
list(renamed_df.columns)

['id',
 'age',
 'gender',
 'diagnosis',
 'occupation status',
 'voice handicap index (vhi) score',
 'reflux symptom index (rsi) score',
 'smoker',
 'number of cigarettes smoked per day',
 'alcohol consumption',
 'number of glasses containing alcoholic beverage drinked in a day',
 "amount of water's litres drink every day",
 'eating habits',
 'carbonated beverages',
 'amount of glasses drinked in a day',
 'tomatoes',
 'coffee',
 'number of cups of coffee drinked in a day',
 'chocolate',
 'gramme of chocolate eaten in  a day',
 'soft cheese',
 'gramme of soft cheese eaten in a day',
 'citrus fruits',
 'number of citrus fruits eaten in a day']

In [66]:
# Simplify the column names
renamed_df = renamed_df.rename(columns={
    'voice handicap index (vhi) score': 'vhi score',
    'reflux symptom index (rsi) score': 'rsi score',
    'number of cigarettes smoked per day': 'cigarettes per day',
    'number of glasses containing alcoholic beverage drinked in a day': 'alcohol per day',
    "amount of water's litres drink every day": 'water litres per day',
    'amount of glasses drinked in a day': 'carbonated per day',
    'number of cups of coffee drinked in a day': 'coffee per day',
    'gramme of chocolate eaten in  a day': 'chocolate grams per day',
    'gramme of soft cheese eaten in a day': 'soft cheese per day',
    'number of citrus fruits eaten in a day': 'citrus fruits per day'
})

# Display the updated DataFrame
renamed_df.head()

Unnamed: 0,id,age,gender,diagnosis,occupation status,vhi score,rsi score,smoker,cigarettes per day,alcohol consumption,...,carbonated per day,tomatoes,coffee,coffee per day,chocolate,chocolate grams per day,soft cheese,soft cheese per day,citrus fruits,citrus fruits per day
0,voice100,24,m,healthy,NU,0,5,no,NU,casual drinker,...,NU,never,always,3,sometimes,NU,almost always,NU,never,NU
1,voice101,60,m,healthy,NU,80,10,no,NU,nondrinker,...,NU,sometimes,always,4,sometimes,NU,sometimes,NU,never,NU
2,voice192,22,m,hyperkinetic dysphonia,Cook,0,10,no,NU,nondrinker,...,NU,sometimes,always,NU,always,NU,sometimes,NU,almost always,NU
3,voice193,46,f,hyperkinetic dysphonia,Housewife,0,36,yes,15,casual drinker,...,NU,sometimes,always,2,sometimes,NU,sometimes,NU,sometimes,NU
4,voice008,51,f,reflux laryngitis,Researcher,19,15,no,NU,casual drinker,...,NU,almost always,always,2,almost always,20g,sometimes,100 gr,almost always,1


In [67]:
# Convert 'per day' to pd
renamed_cols = list(renamed_df.columns)

updated_cols = []
# Update each column
for col in renamed_cols:
    col = col.replace("per day", "pd")
    col = col.replace(" ", "_")
    updated_cols.append(col)

# Set the updated columns
renamed_df.columns = updated_cols

# Display the updated DataFrame
renamed_df.head()

Unnamed: 0,id,age,gender,diagnosis,occupation_status,vhi_score,rsi_score,smoker,cigarettes_pd,alcohol_consumption,...,carbonated_pd,tomatoes,coffee,coffee_pd,chocolate,chocolate_grams_pd,soft_cheese,soft_cheese_pd,citrus_fruits,citrus_fruits_pd
0,voice100,24,m,healthy,NU,0,5,no,NU,casual drinker,...,NU,never,always,3,sometimes,NU,almost always,NU,never,NU
1,voice101,60,m,healthy,NU,80,10,no,NU,nondrinker,...,NU,sometimes,always,4,sometimes,NU,sometimes,NU,never,NU
2,voice192,22,m,hyperkinetic dysphonia,Cook,0,10,no,NU,nondrinker,...,NU,sometimes,always,NU,always,NU,sometimes,NU,almost always,NU
3,voice193,46,f,hyperkinetic dysphonia,Housewife,0,36,yes,15,casual drinker,...,NU,sometimes,always,2,sometimes,NU,sometimes,NU,sometimes,NU
4,voice008,51,f,reflux laryngitis,Researcher,19,15,no,NU,casual drinker,...,NU,almost always,always,2,almost always,20g,sometimes,100 gr,almost always,1


### Convert 'NU' values to 'NaN'

In [68]:
# Create a copy of renamed
updated_df = renamed_df.copy()

# Convert the 'NU' values to 'NaN'
updated_df = updated_df.replace("NU", np.nan)

In [69]:
# Check the updated DataFrame info
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    208 non-null    object
 1   age                   208 non-null    object
 2   gender                208 non-null    object
 3   diagnosis             208 non-null    object
 4   occupation_status     167 non-null    object
 5   vhi_score             208 non-null    object
 6   rsi_score             208 non-null    object
 7   smoker                208 non-null    object
 8   cigarettes_pd         41 non-null     object
 9   alcohol_consumption   208 non-null    object
 10  alcohol_pd            25 non-null     object
 11  water_litres_pd       208 non-null    object
 12  eating_habits         208 non-null    object
 13  carbonated_beverages  208 non-null    object
 14  carbonated_pd         21 non-null     object
 15  tomatoes              208 non-null    ob

In [70]:
# Display the columns with null values
cols_with_null = updated_df.columns[updated_df.isnull().any()]

# Return as a DataFrame
updated_nulls_df = updated_df[cols_with_null]
updated_nulls_df.head()

Unnamed: 0,occupation_status,cigarettes_pd,alcohol_pd,carbonated_pd,coffee_pd,chocolate_grams_pd,soft_cheese_pd,citrus_fruits_pd
0,,,,,3.0,,,
1,,,,,4.0,,,
2,Cook,,,,,,,
3,Housewife,15.0,,,2.0,,,
4,Researcher,,,,2.0,20g,100 gr,1.0


### Clean columns with binary value

In [71]:
# Check the gender column
updated_df['gender'].value_counts()

gender
f    136
m     72
Name: count, dtype: int64

In [72]:
# Check the smoker column
updated_df['smoker'].value_counts()

smoker
no               98
No               56
yes              43
casual smoker    11
Name: count, dtype: int64

In [73]:
# Convert the 'No' to 'no'
updated_df['smoker'] = updated_df['smoker'].str.replace('No', 'no')

# Convert 'casual smoker' to 'casual'
updated_df['smoker'] = updated_df['smoker'].str.replace('casual smoker', 'casual')

# Display the updated values
updated_df['smoker'].value_counts()

smoker
no        154
yes        43
casual     11
Name: count, dtype: int64

In [74]:
# Check the alcohol_consumption column
updated_df['alcohol_consumption'].value_counts()

alcohol_consumption
casual drinker      101
nondrinker           84
habitual drinker     23
Name: count, dtype: int64

In [75]:
# Check the eating_habits column
updated_df['eating_habits']

# Drop 

0       
1       
2       
3       
4       
      ..
203     
204     
205     
206     
207     
Name: eating_habits, Length: 208, dtype: object

In [55]:
# Check the `smoker` and `cigarettes_pd` columns
updated_df[['smoker', 'cigarettes_pd']]

Unnamed: 0,smoker,cigarettes_pd
0,no,
1,no,
2,no,
3,yes,15
4,no,
...,...,...
203,no,
204,no,
205,no,
206,no,
