In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# Directory containing the workbooks that later cells read by relative path.
# The original machine-specific location stays as the default, but it can be
# overridden with the SPEECH_DATA_DIR environment variable so the notebook
# can run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'SPEECH_DATA_DIR',
    '/Users/andrei-macpro/Documents/Data/Classification/speech',
)
os.chdir(DATA_DIR)

In [None]:
# Load the classification workbook (path is relative to the current working
# directory) into a DataFrame, reading it with the openpyxl engine.
data = pd.read_excel(
    io='classification.xlsx',
    engine='openpyxl',
)

In [None]:
# Feature matrix: positional columns 2 through 11 of the loaded table.
X = data.iloc[:, slice(2, 12)].to_numpy()

# Binary target built from the last column: 'no_rad' becomes 0, any other
# value becomes 1.
y = np.array([int(value != 'no_rad') for value in data.iloc[:, -1]])

In [None]:
# One entry per row, copied out of the 'Subject_ID' column.
groups = np.array(data.loc[:, 'Subject_ID'])

In [None]:
# Fit a StandardScaler on the feature matrix, then replace X with its
# scaled version. The fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Show the feature matrix as this cell's output (X was replaced by its
# scaled version in the previous cell).
X

In [None]:
# Print the explained-variance ratio of every principal component, largest
# first.
# This cell previously read `pca` before any cell had created it, so it
# raised NameError when the notebook was run top to bottom. A full PCA is
# now fitted here under its own name, so the `pca` objects created by later
# cells are not affected.
pca_full = PCA().fit(X)
print(sorted(pca_full.explained_variance_ratio_, reverse=True))

In [None]:
# Display names for the ten feature columns, used to label loadings and
# component tables further down.
features = [
    'Proportion speech child',
    'Overlapping speech',
    'Avg OS',
    'Std OS',
    'overall speech',
    'intervals/min',
    'avg interval duration',
    'std interval duration',
    'avg silence duration',
    'std silence duration',
]

In [None]:
# Project the feature matrix onto its first two principal components.
n_components = 2
pca = PCA(n_components=n_components)
components = pca.fit_transform(X)

# Variance captured by the retained components, as a percentage.
total_var = pca.explained_variance_ratio_.sum() * 100

# Display names: each component column index (as a string) maps to "PC <n>",
# and the colour variable 'label' is shown as 'diagnosis'.
labels = dict((str(k), f"PC {k+1}") for k in range(n_components))
labels.update(label='diagnosis')

# Pairwise scatter plots of the components, coloured by the 'label' column.
fig = px.scatter_matrix(
    components,
    dimensions=range(n_components),
    color=data['label'],
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
# Hide the diagonal panels (a component plotted against itself).
fig.update_traces(diagonal_visible=False)
fig.show()


In [None]:
# Show the per-component explained-variance ratios of the PCA currently
# bound to `pca` as this cell's output.
pca.explained_variance_ratio_

In [None]:
# Fit a PCA with the default number of components and accumulate the
# explained-variance ratios component by component.
pca = PCA().fit(X)
exp_var_cumul = pca.explained_variance_ratio_.cumsum()

# Area chart: number of components kept (counted from 1) against the
# cumulative explained variance.
component_counts = range(1, len(exp_var_cumul) + 1)
px.area(
    x=component_counts,
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)

In [None]:
# Biplot: samples in the plane of the first two principal components, with
# one labelled line per original feature.
pca = PCA(n_components=4)
components = pca.fit_transform(X)

# Feature loadings: component weights (features as rows) scaled by the square
# root of each component's explained variance.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Samples on the first two components, coloured by the 'label' column.
fig = px.scatter(components, x=0, y=1, color=data['label'])

for i, feature in enumerate(features):
    # End point of this feature's line on the (PC 1, PC 2) plane.
    tip_x, tip_y = loadings[i, 0], loadings[i, 1]
    fig.add_shape(type='line', x0=0, y0=0, x1=tip_x, y1=tip_y)
    # Put the feature name at the end of its line.
    fig.add_annotation(
        text=feature,
        x=tip_x, y=tip_y,
        ax=0, ay=0,
        xanchor="center", yanchor="bottom",
    )
fig.show()

In [None]:
# Print the component weights as a table: one row per principal component,
# one column per feature.
# The row labels are generated from the number of components actually held
# by `pca`. The previous hard-coded list of five names did not match the
# 4-component PCA fitted in the preceding cell, so the DataFrame constructor
# raised a shape-mismatch ValueError.
pc_names = [f'PC-{k + 1}' for k in range(pca.components_.shape[0])]
print(pd.DataFrame(pca.components_, columns=features, index=pc_names))

In [None]:


# Project the feature matrix onto its first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Variance captured by the three components, as a percentage.
total_var = pca.explained_variance_ratio_.sum() * 100

# 3-D scatter of the samples on the three components, coloured by the
# 'label' column; the axes are renamed from column indices to "PC <n>".
fig = px.scatter_3d(
    components,
    x=0,
    y=1,
    z=2,
    color=data['label'],
    labels={str(axis): f'PC {axis + 1}' for axis in range(3)},
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.show()

In [None]:
# Working directory for the workbook that the next cell reads by relative
# path. The original machine-specific location stays as the default, but it
# can be overridden with the SPEECH_DATA_DIR environment variable so the
# notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'SPEECH_DATA_DIR',
    '/Users/andrei-macpro/Documents/Data/Classification/speech',
)
os.chdir(DATA_DIR)

In [None]:
# Load the 'Play' sheet of the features workbook (path is relative to the
# current working directory). This rebinds `data`, replacing the table loaded
# earlier in the notebook.
data = pd.read_excel(
    io='features.xlsx',
    sheet_name='Play',
    engine='openpyxl',
)

In [None]:
import seaborn as sns

# First two colours of seaborn's 'bright' palette.
# NOTE(review): no `hue` variable is passed below, so this palette may have
# no visible effect - confirm whether hue="label" was intended.
two_bright_colors = sns.color_palette('bright')[:2]

# Distribution of 'Proportion speech child', drawn as one panel per value of
# the 'label' column.
g = sns.displot(
    data=data[['Proportion speech child', 'label']],
    x="Proportion speech child",
    col="label",
    fill=True,
    palette=two_bright_colors,
    height=7,
    aspect=1,
)

# Remove the plot spines, including the left one.
g.despine(left=True)
# Blank x-axis label; the y-axis label is set to "child speech".
g.set_axis_labels("", "child speech")
# Left disabled as in the original; presumably there is no legend to retitle
# without a hue variable - confirm before re-enabling.
#g.legend.set_title(" ")