In [None]:
#Imports iniciales
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from bioinfokit.visuz import cluster
from pca import getPrincipalComponent

In [None]:
# Load the CSV data
# `dfWNames` keeps every column of the file; its first column is used later
# as the row labels (country names).
dfWNames = pd.read_csv('europe.csv')
# `df` keeps only columns 1..7, the ones fed to the scaler and the PCA below.
df = dfWNames[dfWNames.columns[1:8]]
df.head()

In [None]:
# Boxplot of the country features (raw values, not standardized)
rawFig, rawAxes = plt.subplots()
sns.boxplot(data=df, ax=rawAxes)
plt.show()

In [None]:
# Boxplot of the country features after standardization
# `x` is the scaled feature matrix; later cells run the PCA on it.
scaler = StandardScaler()
x = scaler.fit_transform(df)
# fit_transform returns a bare array, so the columns are re-labelled for the plot.
featureLabels = ["Area","GDP","Inflation","Life. expect","Military","Pop. growth","Unemployment"]
namedBoxplot = pd.DataFrame(data=x, columns=featureLabels)
sns.boxplot(data=namedBoxplot)
plt.show()

In [None]:
# Principal component computation
# Fit a 7-component PCA on the standardized matrix `x`. `pca` keeps the fitted
# model (loadings in `pca.components_`, variance shares in
# `pca.explained_variance_ratio_`); `PC` holds the transformed rows returned
# by fit_transform and is reused by the tables and biplots below.
pca=PCA(n_components=7)
PC=pca.fit_transform(x)

In [None]:
# Compute the principal components of each country

# Row labels: the first column of the full frame, for the 28 rows.
countries = [dfWNames.iloc[:,0][row] for row in range(28)]

# One row per country, one column per component.
# NOTE(review): getPrincipalComponent comes from the project-local `pca`
# module and is not visible here; presumably it returns the value of
# component `comp` for the given row - confirm against its definition.
principalComponents = np.zeros((28,7))
for row in range(28):
    for comp in range(7):
        principalComponents[row][comp] = getPrincipalComponent(comp, pca.components_, PC[row])

# Column-wise view keyed 'PC1'..'PC7', then wrapped in a labelled frame.
countriesData = {'PC'+str(comp+1): principalComponents[:,comp] for comp in range(7)}
countriesDF = pd.DataFrame(countriesData, index=countries)

# Show the principal components of every country

countriesDF.head(28)

In [None]:
# Pie chart of the share of total variance explained by each component
componentLabels = ["PC" + str(k + 1) for k in range(7)]
plt.pie(x=pca.explained_variance_ratio_, labels=componentLabels)
plt.show()

In [None]:
# Table of the per-feature terms of the first principal component, by country

# The PC1 score of a country is sum_j loading_j * x[i][j], where `x` is the
# standardized feature matrix the PCA was fitted on. The per-feature term is
# therefore pca.components_[0][j] * x[i][j]. It must be built from `x`, not
# from `PC`: PC[i][j] is the score on component j, not the value of feature j,
# so multiplying it by a feature loading gives a meaningless number.
# Broadcasting the loading vector over the rows of `x` gives one row per
# country and one column per feature.
firstComponentCharacteristics = pca.components_[0] * x

# Key each column by its feature name as found in `df`.
countriesFirstComponentCharacteristicsData = {
    df.columns[j]: firstComponentCharacteristics[:, j]
    for j in range(firstComponentCharacteristics.shape[1])
}

countriesFirstComponentCharacteristicsDataDF = pd.DataFrame(countriesFirstComponentCharacteristicsData, index=countries)

# Show the first-component feature terms of every country

countriesFirstComponentCharacteristicsDataDF.head(len(countries))


In [None]:
# Barplot of the PC1 coefficients (loadings)
sns.set_style({'axes.grid':True})
coefs = np.array(pca.components_[0])
# 1 for non-negative coefficients, 0 for negative ones; used only to colour the bars.
coefSigns = [int(c >= 0) for c in coefs]
coefFeatureNames = ["Area","GDP","Inflation","Life. expect","Military","Pop. growth","Unemployment"]
coefAxes = sns.barplot(
    x=coefFeatureNames,
    y=coefs,
    hue=coefSigns,
    palette=sns.color_palette(),
    orient="v",
)
coefAxes.set(title="Coefficients")
plt.show()

In [None]:
# 2D biplot
# Axis annotations: percentage of variance explained by PC1 and PC2, to two decimals.
pc1Pct2D, pc2Pct2D = (round(ratio*100, 2) for ratio in pca.explained_variance_ratio_[:2])
cluster.biplot(
    cscore=PC,
    loadings=pca.components_,
    labels=df.columns,
    var1=pc1Pct2D,
    var2=pc2Pct2D,
    show=True,
)



In [None]:
# 3D biplot
# Axis annotations: percentage of variance explained by PC1..PC3, to two decimals.
pc1Pct3D, pc2Pct3D, pc3Pct3D = (round(ratio*100, 2) for ratio in pca.explained_variance_ratio_[:3])
cluster.biplot(
    cscore=PC,
    loadings=pca.components_,
    labels=df.columns,
    var1=pc1Pct3D,
    var2=pc2Pct3D,
    var3=pc3Pct3D,
    show=True,
)


In [None]:
# Barplot of the PC1 coefficients when the principal components are computed
# on the raw (non-standardized) data

# Principal component computation on the raw feature frame `df`
pcaNotStandarized=PCA(n_components=7)
PCNotStandarized=pcaNotStandarized.fit_transform(df)

# Barplot creation
sns.set_style({'axes.grid':True})
coefsNotStandarized = np.array(pcaNotStandarized.components_[0])
# Colour each bar by the sign of the coefficient actually being plotted.
# This must come from `coefsNotStandarized`; using `coefs` (the loadings of
# the standardized PCA from another cell) would colour the bars by the wrong
# signs and make this cell depend on that cell having run.
signsNotStandarized = [1 if c >= 0 else 0 for c in coefsNotStandarized]
sns.barplot(y=coefsNotStandarized,x=["Area","GDP","Inflation","Life. expect","Military","Pop. growth","Unemployment"],hue=signsNotStandarized,palette=sns.color_palette(),orient="v").set(title="Coefficients (Data not standarized)")
plt.show()


