In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub
import seaborn as sns
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = kagglehub.dataset_download("vinayakshanawad/heart-rate-prediction-to-monitor-stress-level")

In [3]:
p = Path(path)

train_path = p / "Train Data" / "Train Data Zip"
test_path = p / "Test Data" / "Test Zip"

train_files = list(train_path.rglob("*.csv"))
print("Arquivos de treino:", train_files)

print()

test_files = list(test_path.rglob("*.csv"))
print("Arquivos de teste:", test_files)

Arquivos de treino: [WindowsPath('C:/Users/rodwa/.cache/kagglehub/datasets/vinayakshanawad/heart-rate-prediction-to-monitor-stress-level/versions/1/Train Data/Train Data Zip/frequency_domain_features_train.csv'), WindowsPath('C:/Users/rodwa/.cache/kagglehub/datasets/vinayakshanawad/heart-rate-prediction-to-monitor-stress-level/versions/1/Train Data/Train Data Zip/heart_rate_non_linear_features_train.csv'), WindowsPath('C:/Users/rodwa/.cache/kagglehub/datasets/vinayakshanawad/heart-rate-prediction-to-monitor-stress-level/versions/1/Train Data/Train Data Zip/time_domain_features_train.csv')]

Arquivos de teste: [WindowsPath('C:/Users/rodwa/.cache/kagglehub/datasets/vinayakshanawad/heart-rate-prediction-to-monitor-stress-level/versions/1/Test Data/Test Zip/frequency_domain_features_test.csv'), WindowsPath('C:/Users/rodwa/.cache/kagglehub/datasets/vinayakshanawad/heart-rate-prediction-to-monitor-stress-level/versions/1/Test Data/Test Zip/heart_rate_non_linear_features_test.csv'), WindowsPa

In [4]:
# função para determinar o df desejado

def dataframe(dominio):
    file_desejado = [f for f in train_files if dominio in f.name.lower()][0]
    
    df_desejado = pd.read_csv(file_desejado)
    print(df_desejado.shape)
    print(df_desejado.columns)
    
    return df_desejado
    

In [5]:
# df no dominio do tempo
df_treino_tempo = dataframe("time_domain")

# df no dominio do frequencia
df_treino_freq = dataframe("frequency_domain")

# df no dominio não linear
df_treino_nao_linear = dataframe("non_linear")

(369289, 20)
Index(['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD', 'HR',
       'pNN25', 'pNN50', 'KURT', 'SKEW', 'MEAN_REL_RR', 'MEDIAN_REL_RR',
       'SDRR_REL_RR', 'RMSSD_REL_RR', 'SDSD_REL_RR', 'SDRR_RMSSD_REL_RR',
       'KURT_REL_RR', 'SKEW_REL_RR', 'uuid'],
      dtype='object')
(369289, 12)
Index(['uuid', 'VLF', 'VLF_PCT', 'LF', 'LF_PCT', 'LF_NU', 'HF', 'HF_PCT',
       'HF_NU', 'TP', 'LF_HF', 'HF_LF'],
      dtype='object')
(369289, 7)
Index(['uuid', 'SD1', 'SD2', 'sampen', 'higuci', 'datasetId', 'condition'], dtype='object')


In [6]:
# merge dos df 
df_treino_0 = df_treino_nao_linear.merge(df_treino_freq, on='uuid', how='inner',validate='one_to_one')
df_treino = df_treino_0.merge(df_treino_tempo, on='uuid', how='inner',validate='one_to_one')

In [12]:
df = df_treino.copy()

In [13]:
# removendo ID's
df.drop(['datasetId', 'uuid'], axis=1,inplace=True)

Perform an unconditional multi-variate analysis of the predictors. Specifically, you
must implement the principal components analysis (PCA) yourself without using
pre-made PCA functions or libraries. For visualisation purposes, retain only the
first two principal components (those associated with the two largest eigenvalues)
and plot the scatter plot of the projected observations. Again, for each projected
point (observation) you must use colours or symbols to indicate the associated class
label. [Remember to perform the necessary pre-processing of the data].

In [15]:
# removendo colunas-alvo
df_2 = df.drop(['condition', 'HR'], axis=1)
df_2.head(3)

Unnamed: 0,SD1,SD2,sampen,higuci,VLF,VLF_PCT,LF,LF_PCT,LF_NU,HF,...,KURT,SKEW,MEAN_REL_RR,MEDIAN_REL_RR,SDRR_REL_RR,RMSSD_REL_RR,SDSD_REL_RR,SDRR_RMSSD_REL_RR,KURT_REL_RR,SKEW_REL_RR
0,11.001565,199.061782,2.139754,1.163485,2661.894136,72.203287,1009.249419,27.375666,98.485263,15.522603,...,-0.856554,0.335218,-0.000203,-0.000179,0.01708,0.007969,0.007969,2.143342,-0.856554,0.335218
1,9.170129,114.634458,2.174499,1.084711,2314.26545,76.975728,690.113275,22.954139,99.695397,2.108525,...,-0.40819,-0.155286,-5.9e-05,0.000611,0.013978,0.004769,0.004769,2.930855,-0.40819,-0.155286
2,11.533417,118.939253,2.13535,1.176315,1373.887112,51.152225,1298.222619,48.335104,98.950472,13.769729,...,0.351789,-0.656813,-1.1e-05,-0.000263,0.018539,0.008716,0.008716,2.127053,0.351789,-0.656813


Remoção de preditores que estão altamente correlacionados

In [30]:
limiar = 0.8
preditores = [i for i in df_2.columns]
removidos = []

In [31]:
matriz_corr = df_2[preditores].corr(numeric_only=True).abs()

In [32]:
while True:
    np.fill_diagonal(matriz_corr.values, 0) # zera as diagonais
    trigSup = matriz_corr.where(np.triu(np.ones(matriz_corr.shape), k=1).astype(bool)) # trig superior

    # quando o valor maximo da triangular superior for menor que o limiar, 
    # significa que todas as colinearidades foram removidas
    if trigSup.max().max() <= limiar:
        break

    # par mais correlacionado
    A, B = trigSup.stack().idxmax()

    # calculo da média de correlações, removendo com ele prórpio e com a outra escolhida
    mA = matriz_corr.loc[A, preditores].drop([A, B]).mean()
    mB = matriz_corr.loc[B, preditores].drop([A, B]).mean()

    # indicando remover o preditor com a maior média
    drop = A if mA >= mB else B

    # salvando a mudança
    valor_corr = matriz_corr.loc[A, B]
    removidos.append((drop, (A, B), float(valor_corr)))

    # removendo
    preditores.remove(drop)
    matriz_corr = matriz_corr.drop(index=drop, columns=drop)
    

In [33]:
print(removidos)

[('SD1', ('SD1', 'SDSD'), 1.0000000000000104), ('LF_NU', ('LF_NU', 'HF_NU'), 1.000000000000001), ('KURT', ('KURT', 'KURT_REL_RR'), 1.0), ('SKEW', ('SKEW', 'SKEW_REL_RR'), 1.0), ('RMSSD_REL_RR', ('RMSSD_REL_RR', 'SDSD_REL_RR'), 0.9999999999572793), ('RMSSD', ('RMSSD', 'SDSD'), 0.9999998264707147), ('SDRR', ('SD2', 'SDRR'), 0.9999972215849533), ('HF_NU', ('HF_NU', 'HF_LF'), 0.9961753424035372), ('VLF_PCT', ('VLF_PCT', 'LF_PCT'), 0.9952395745509537), ('MEAN_RR', ('MEAN_RR', 'MEDIAN_RR'), 0.9609491736076482), ('TP', ('VLF', 'TP'), 0.9517997470476465), ('SDSD', ('SDSD', 'pNN25'), 0.9517711780340457), ('pNN25', ('LF', 'pNN25'), 0.9157888760316598), ('SD2', ('SD2', 'SDRR_RMSSD'), 0.9154748170303951), ('SDSD_REL_RR', ('SDRR_REL_RR', 'SDSD_REL_RR'), 0.8994602585830078), ('HF_PCT', ('HF_PCT', 'HF_LF'), 0.8364211887125808)]


In [35]:
for i, j, k in removidos:
    df_2.drop([i], axis=1, inplace=True)

In [37]:
print(df_2.shape)
print(df_2.columns)

(369289, 17)
Index(['sampen', 'higuci', 'VLF', 'LF', 'LF_PCT', 'HF', 'LF_HF', 'HF_LF',
       'MEDIAN_RR', 'SDRR_RMSSD', 'pNN50', 'MEAN_REL_RR', 'MEDIAN_REL_RR',
       'SDRR_REL_RR', 'SDRR_RMSSD_REL_RR', 'KURT_REL_RR', 'SKEW_REL_RR'],
      dtype='object')


Centralizando e escalonando os dados para construir a PCA

In [40]:
m = len(df_2)
n = len(preditores)

df_C_E = np.zeros((m, n))
for j in range(n):  # para cada coluna
    col_media = df_2.iloc[:, j].mean()
    col_desvio_padrao = df_2.iloc[:, j].std()
    for i in range(m):  # para cada linha
        df_C_E[i, j] = (df_2.iloc[i, j] - col_media) / col_desvio_padrao

# converte de volta pra DataFrame
df_C_E = pd.DataFrame(df_C_E, columns=df_2.columns)

In [41]:
print(df_C_E.mean(), df_C_E.std())

sampen              -8.277974e-16
higuci               3.373523e-15
VLF                  2.188013e-16
LF                   1.318598e-16
LF_PCT              -4.932574e-16
HF                  -2.111998e-16
LF_HF                2.115469e-17
HF_LF                2.539366e-16
MEDIAN_RR            1.171568e-15
SDRR_RMSSD          -2.750624e-16
pNN50               -1.240370e-15
MEAN_REL_RR         -3.929940e-18
MEDIAN_REL_RR        2.115109e-16
SDRR_REL_RR         -7.293339e-16
SDRR_RMSSD_REL_RR    1.072684e-15
KURT_REL_RR          4.822053e-17
SKEW_REL_RR          1.599319e-17
dtype: float64 sampen               1.0
higuci               1.0
VLF                  1.0
LF                   1.0
LF_PCT               1.0
HF                   1.0
LF_HF                1.0
HF_LF                1.0
MEDIAN_RR            1.0
SDRR_RMSSD           1.0
pNN50                1.0
MEAN_REL_RR          1.0
MEDIAN_REL_RR        1.0
SDRR_REL_RR          1.0
SDRR_RMSSD_REL_RR    1.0
KURT_REL_RR          1.0
SKEW_RE

In [42]:
print(df_C_E)

          sampen    higuci       VLF        LF    LF_PCT        HF     LF_HF  \
0       0.373347 -0.302405  0.254610  0.109234 -0.418915 -0.522546 -0.141218   
1       0.541197 -1.569037  0.063161 -0.446586 -0.694566 -0.818018  0.585607   
2       0.352070 -0.096115 -0.454734  0.612521  0.887760 -0.561157 -0.060125   
3       0.559757 -0.041875  0.116081  0.103543 -0.299551 -0.463969 -0.168069   
4       0.766426  1.082437 -0.577386  0.827717  1.256023  0.197587 -0.239679   
...          ...       ...       ...       ...       ...       ...       ...   
369284  0.166080  2.132382 -1.047251 -1.103180  0.880563 -0.022433 -0.298697   
369285  0.332335 -0.101574 -0.353008 -0.399262 -0.162965 -0.807768  0.450891   
369286  0.739961 -0.676295 -0.137234  2.490628  1.295188 -0.770028  1.214791   
369287  0.140856 -0.005712 -0.326742  0.575783  0.591759  0.162792 -0.245506   
369288  0.703973  0.832812 -0.432040  1.315066  1.213193  0.470285 -0.243577   

           HF_LF  MEDIAN_RR  SDRR_RMSSD

In [43]:
matriz_cov = df_C_E.cov()
print(matriz_cov)

                     sampen    higuci       VLF        LF    LF_PCT        HF  \
sampen             1.000000  0.077104 -0.290457  0.222920  0.415872 -0.171254   
higuci             0.077104  1.000000 -0.570043  0.035736  0.496831  0.434092   
VLF               -0.290457 -0.570043  1.000000  0.026521 -0.664830 -0.010441   
LF                 0.222920  0.035736  0.026521  1.000000  0.570404  0.315662   
LF_PCT             0.415872  0.496831 -0.664830  0.570404  1.000000  0.216789   
HF                -0.171254  0.434092 -0.010441  0.315662  0.216789  1.000000   
LF_HF              0.108667 -0.244251  0.126668  0.099452 -0.082040 -0.220058   
HF_LF             -0.297841  0.563333 -0.094013 -0.235807 -0.048411  0.757545   
MEDIAN_RR          0.235890 -0.461297  0.399023  0.084147 -0.262426 -0.550244   
SDRR_RMSSD        -0.680969 -0.455149  0.617610 -0.276739 -0.637903 -0.062388   
pNN50             -0.102221 -0.153064  0.529802  0.670072  0.003415  0.412257   
MEAN_REL_RR       -0.047586 

In [45]:
autovalores, autovetores = np.linalg.eig(matriz_cov)
print(autovalores, autovetores)

[4.35651805 3.49856446 3.02080098 1.28020586 1.02739703 0.88268936
 0.6944567  0.62536093 0.4498584  0.3966899  0.21247288 0.02936942
 0.06178    0.05481928 0.16467581 0.13486501 0.1094759 ] [[ 1.24251952e-02  4.25895766e-01  1.16620988e-01  2.49104331e-01
  -1.03288987e-02  5.80085965e-02  3.79498773e-01  1.98356410e-01
   4.11952406e-01 -2.80870344e-03  6.11464753e-02 -2.83368512e-02
   4.02550614e-01 -9.10565185e-02 -1.11561783e-01 -4.45991227e-01
  -4.65690576e-03]
 [ 3.86960288e-01  9.72378665e-03  1.80768208e-01  8.80814051e-02
  -3.67830443e-02  1.78475780e-01 -1.76877670e-01 -3.07268410e-01
   2.29777466e-01 -3.76810201e-01 -7.83584933e-02  2.47324393e-02
   4.50530600e-01  9.68907204e-02  2.85454336e-01  3.72539431e-01
  -1.29053188e-01]
 [-2.40626693e-01 -1.89018930e-01 -3.83370421e-01  1.57783226e-01
   2.01864198e-02  3.21523371e-02  1.46525768e-01 -2.85609559e-02
   3.25053545e-01  5.85950376e-02  3.77047920e-01  2.66394387e-01
  -1.22201931e-01  4.90725127e-02  3.61838466