# Data Analysis Notebook

This notebook presents various visualizations of the "voice.csv" dataset to better understand the relationships between variables and the context of the study.

# Dependencies & Data Import

In [1]:
import pandas as pd
from scipy.stats import pointbiserialr
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots



In [2]:
# Location of the raw dataset, relative to the notebook's working directory
file_path = "data/voice.csv"

# Load the full CSV into a DataFrame
data = pd.read_csv(file_path)

# Encode the target as integers: 'male' -> 1, 'female' -> 0
# (any other value in the column would become NaN after the mapping)
label_codes = {'male': 1, 'female': 0}
data['label'] = data['label'].map(label_codes)


# Data Description

## Variables Description

In [34]:
# Every column except the target
features = data.drop(columns=['label'])

# One row per feature: its name, mean and (sample) standard deviation.
# Plain arrays are used so the table gets a default 0..n-1 index directly.
stats_df = pd.DataFrame({
    'Var_Name': list(features.columns),
    'Mean': features.mean().to_numpy(),
    'Standard_Deviation': features.std().to_numpy(),
})

# Human-readable descriptions, matched to the rows BY POSITION: the list must
# contain exactly one entry per feature, in the same order as the columns.
stats_df['Description'] = [
    "mean frequency (in kHz)",
    "standard deviation of frequency",
    "median frequency (in kHz)",
    "first quantile (in kHz)",
    "third quantile (in kHz)",
    "interquantile range (in kHz)",
    "skewness",
    "kurtosis",
    "spectral entropy",
    "spectral flatness",
    "mode frequency",
    "frequency centroid",
    "average of fundamental frequency",
    "minimum fundamental frequency",
    "maximum fundamental frequency",
    "average of dominant frequency",
    "minimum of dominant frequency",
    "maximum of dominant frequency",
    "range of dominant frequency",
    "modulation index",
]
stats_df


Unnamed: 0,Var_Name,Mean,Standard_Deviation,Description
0,meanfreq,0.180907,0.029918,mean frequency (in kHz)
1,sd,0.057126,0.016652,standard deviation of frequency
2,median,0.185621,0.03636,median frequency (in kHz)
3,Q25,0.140456,0.04868,first quantile (in kHz)
4,Q75,0.224765,0.023639,third quantile (in kHz)
5,IQR,0.084309,0.042783,interquantile range (in kHz)
6,skew,3.140168,4.240529,skewness
7,kurt,36.568461,134.928661,kurtosis
8,sp.ent,0.895127,0.04498,spectral entropy
9,sfm,0.408216,0.177521,spectral flatness


## Correlation & P-values

In [58]:

# Select the numeric columns in this cell: the name was previously only defined
# by a later cell, so running the notebook top to bottom raised a NameError here.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlation matrix for all numerical columns
correlation_matrix = data[numerical_columns].corr()

# Reverse the row order only (columns keep their order), which moves the
# self-correlations from the main diagonal onto the anti-diagonal of the matrix
correlation_matrix = correlation_matrix.iloc[::-1, ::]

# Create the heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='magenta',
    colorbar=dict(title="Correlation")
))

# Print each correlation value, rounded to 2 decimals, in the middle of its cell.
# x/y are the positional indices of the column/row on the categorical axes.
n_rows, n_cols = correlation_matrix.shape
annotations = [
    go.layout.Annotation(
        x=j,
        y=i,
        text=f"{correlation_matrix.iloc[i, j]:.2f}",
        showarrow=False,
        font=dict(color="black"),
        align="center"
    )
    for i in range(n_rows)
    for j in range(n_cols)
]

# Final adjustments (height, annotations, axis titles and tick angle)
fig_heatmap.update_layout(
    annotations=annotations,
    title="Correlation Matrix Heatmap",
    xaxis=dict(title="Features", tickangle=45),
    yaxis=dict(title="Features", tickangle=45),
    height=900
)

# Show the heatmap
fig_heatmap.show()

In [36]:
# Columns stored as 64-bit floats or integers
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Point-biserial correlation of each numeric column (except the label itself)
# with the binary label, stored as {column: {'correlation': r, 'p_value': p}}
correlations = {
    feature: dict(zip(
        ('correlation', 'p_value'),
        pointbiserialr(data['label'], data[feature]),
    ))
    for feature in numerical_columns
    if feature != 'label'
}

# Transpose so that each feature becomes a row
correlation_df = pd.DataFrame(correlations).T
correlation_df

Unnamed: 0,correlation,p_value
meanfreq,-0.337415,3.368951e-85
sd,0.479539,6.654756e-182
median,-0.283919,8.259210000000001e-60
Q25,-0.511455,9.140832e-211
Q75,0.066906,0.0001642021
IQR,0.618916,0.0
skew,0.036627,0.03926293
kurt,0.087195,8.869557e-07
sp.ent,0.490552,1.614016e-191
sfm,0.357499,3.877715e-96


In [50]:
# Central shade of the sequential Magenta palette, used for every bar
central_color = px.colors.sequential.Magenta[3]

# One bar per feature; bar height is the feature's correlation with the label
correlation_bars = go.Bar(
    x=correlation_df.index,
    y=correlation_df['correlation'],
    name='Correlation',
    marker=dict(color=central_color),
)

# Build the figure around that single trace, then label it
fig_corr = go.Figure()
fig_corr.add_trace(correlation_bars)
fig_corr.update_layout(
    title="Feature Correlations with Label",
    xaxis_title="Features",
    yaxis_title="Correlation",
    barmode='group',
)

# Render the chart
fig_corr.show()


**Best features**: interquartile range (IQR), spectral entropy (sp.ent), spectral flatness (sfm)



# Key Feature Distributions Grouped by Label

In [71]:

# Columns to plot, and the matching subplot titles (same order)
variables = ["IQR", "sfm", "sp.ent", "meanfun"]
real_variables = ["Interquantile Range (kHz)", "Spectral Flatness", "Spectral Entropy", "Average of Fundamental Frequency"]
colors = [px.colors.sequential.Magenta[1], px.colors.sequential.Magenta[5]]  # Indexed by label: colors[0] for label 0, colors[1] for label 1

# Create subplots (2 rows, 2 columns), one panel per variable
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=real_variables,
    shared_yaxes=False,
    vertical_spacing=0.1,
    horizontal_spacing=0.1
)

# Filter the rows of each label once, instead of re-filtering for every trace
data_by_label = {label: data[data["label"] == label] for label in (0, 1)}

# Loop through the variables to create each pair of box plots
for i, var in enumerate(variables):
    # Fill the 2x2 grid row by row (plotly rows/cols are 1-based)
    row = i // 2 + 1
    col = i % 2 + 1

    # One box per label, added in the order label 0 then label 1
    for label, subset in data_by_label.items():
        fig.add_trace(
            go.Box(
                x=subset["label"],
                y=subset[var],
                boxmean="sd",  # also draw the mean and standard deviation
                marker=dict(color=colors[label]),
                name=f"{var} (label = {label})",
                width=0.5,
                line=dict(width=2),
                boxpoints="all"  # show every individual observation next to the box
            ),
            row=row, col=col
        )

# Update layout (the closing parenthesis was missing, which made the cell a SyntaxError)
fig.update_layout(
    title="Boxplots for Different Variables Grouped by Label",
    showlegend=False,
    template="plotly_white",
    height=900,  # Adjust height for better visibility
    width=1700,  # Adjust width for better spacing
    xaxis_title="",
    yaxis_title="",
    boxmode='group'
)

# Show the plot
fig.show()
