# Exploratory Data Analysis


## Importing libraries

In [None]:
import pandas as pd
import numpy as np
from astropy.table.column import FORMATTER
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from src.preprocessing import *


## Loading data

In [None]:
# Read the processed feature table (presumably the numerically imputed
# features, judging by the file name) and the TARGET column of the raw
# training table.
df = pd.read_csv('../data/processed/num_imputed_df.csv')
y = pd.read_csv('../data/raw/application_train.csv')['TARGET']
# TODO: y has to be adjusted because it has more rows than df

# NOTE(review): `forma` is not defined in this notebook; presumably it comes
# from the wildcard import of src.preprocessing. Its result is displayed as
# the cell output.
desc = forma(df)
desc

In [None]:
# Display the first rows of the feature table as a quick sanity check.
df.head()

## Principal Component Analysis (PCA)

### Creation of training and test sets

In [None]:
# Hold out 20% of the rows of df as a test set; the fixed random_state makes
# the split reproducible across runs.
# NOTE(review): only df is split here; y is not passed, so this cell does not
# produce a matching target split.
x_train, x_test = train_test_split(df, test_size=0.2, random_state=42)

### Standardizing data

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits, so the test rows never contribute to the
# statistics used for scaling.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (
    scaler.transform(part) for part in (x_train, x_test)
)

### PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (95% here). The fit uses the
# standardized training split only.
pca = PCA(n_components=0.95).fit(x_train_scaled)
# Number of components actually retained (shown as the cell output).
pca.n_components_

In [None]:
# Share of the total variance explained by each retained component, and the
# running total over the components in order.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Plot the cumulative curve against a 1-based component count. When only y
# values are given, matplotlib uses 0-based positions for x, which would put
# the value for the first component at x=0 under an axis labelled
# 'Number of Components' and shift every point one step to the left.
plt.figure(figsize=(10, 6))
plt.plot(
    np.arange(1, len(cumulative_variance) + 1),
    cumulative_variance,
    marker='o',
    linestyle='--',
)
plt.title('Cumulative Explained Variance by PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Project both standardized splits onto the retained principal components.
x_train_pca, x_test_pca = (
    pca.transform(scaled) for scaled in (x_train_scaled, x_test_scaled)
)

# Wrap the projections in DataFrames whose columns are labelled PC1..PCk,
# where k is the number of retained components.
# NOTE(review): no index is passed, so if pca.transform returns plain arrays
# these frames get a fresh default index rather than the row labels of
# x_train / x_test. Confirm that is intended before joining with y.
pca_columns = [f'PC{i+1}' for i in range(pca.n_components_)]
x_train_pca_df = pd.DataFrame(data=x_train_pca, columns=pca_columns)
x_test_pca_df = pd.DataFrame(data=x_test_pca, columns=pca_columns)

# Print the first rows of the training projection, then of the test one.
print(x_train_pca_df.head(), x_test_pca_df.head(), sep='\n')