In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('CC GENERAL.csv')
df.drop('CUST_ID', axis=1, inplace=True)
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].mean())
df = df.dropna(axis = 0, subset = ['CREDIT_LIMIT'])

In [9]:
class PCA ():

    def __init__(self,n_components = 0.95):
        self.n_components = n_components
        if self.n_components > 1:
            self.type = 'var'
        else:
            self.type = 'ratio'

    def fit(self,X):

        self.mean = np.mean(X, axis = 0)
        self.variance = np.var(X, axis = 0)
        # fill
        standardized_data = (X - self.mean) / np.sqrt(self.variance)
        covariance = np.round(np.cov(standardized_data, rowvar = False, bias = True), 5)
        print(covariance)
        eig_val, eig_vec = np.linalg.eig(covariance)
        if self.type == 'var':
            self.components = eig_vec[:, :self.n_components]
        else:
            i=1
            while True:
                data_var = np.sum(eig_val[:i])/np.sum(eig_val)
                if data_var >= self.n_components:
                    self.components = eig_vec[:, :i]
                    break
                i+=1
    def transform(self,Z):
        # fill
        standardized_data = (Z - self.mean) / np.std(Z, axis = 0)
        Z_new = standardized_data@self.components
        return Z_new

    #optional 
    def plot_explained_variance(self, X):
        standardized_data = (X - self.mean) / np.sqrt(self.variance)
        covariance = np.round(np.cov(standardized_data, rowvar = False, bias = True), 5)
        eig_val, eig_vec = np.linalg.eig(covariance)
        u = eig_vec[:, :self.n_components]
        pca_component = pd.DataFrame(u, index = df['feature_names'], columns = ['PC1', 'PC2'])
        plt.figure(figsize = (5, 7))
        sns.heatmap(pca_component, cmap = 'coolwarm')
        plt.title('PCA Components')
        plt.show()

In [4]:
pca_section = PCA(n_components=0.98)
X_transofmed = pca_section.fit(df.values)

[[ 1.       0.32234  0.18123  0.16433  0.12644  0.49668 -0.07805  0.07311
  -0.06327  0.44925  0.38514  0.1543   0.53128  0.32277  0.39427 -0.31902
   0.07242]
 [ 0.32234  1.       0.13359  0.10426  0.1242   0.09931  0.22944  0.20229
   0.17587  0.19202  0.14152  0.18953  0.09584  0.06484  0.11419 -0.09531
   0.11857]
 [ 0.18123  0.13359  1.       0.91684  0.67989 -0.0515   0.39299  0.49841
   0.31554 -0.12014 -0.06719  0.68955  0.35696  0.60325  0.0935   0.18036
   0.08615]
 [ 0.16433  0.10426  0.91684  1.       0.33061 -0.03134  0.26491  0.52488
   0.1277  -0.08262 -0.04623  0.54551  0.31972  0.56728  0.04858  0.13274
   0.06405]
 [ 0.12644  0.1242   0.67989  0.33061  1.      -0.06426  0.4424   0.21402
   0.51133 -0.13231 -0.07402  0.6281   0.2565   0.38407  0.13167  0.18255
   0.08602]
 [ 0.49668  0.09931 -0.0515  -0.03134 -0.06426  1.      -0.21558 -0.08679
  -0.17712  0.62853  0.65649 -0.07588  0.30399  0.45323  0.13921 -0.15296
  -0.06855]
 [-0.07805  0.22944  0.39299  0.26491  0

In [5]:
print(pca_section.components.shape)

(17, 14)


In [6]:
pca_section.transform(df.values).shape

(8949, 14)

In [10]:
pca_section.plot_explained_variance(df.values)

[[ 1.       0.32234  0.18123  0.16433  0.12644  0.49668 -0.07805  0.07311
  -0.06327  0.44925  0.38514  0.1543   0.53128  0.32277  0.39427 -0.31902
   0.07242]
 [ 0.32234  1.       0.13359  0.10426  0.1242   0.09931  0.22944  0.20229
   0.17587  0.19202  0.14152  0.18953  0.09584  0.06484  0.11419 -0.09531
   0.11857]
 [ 0.18123  0.13359  1.       0.91684  0.67989 -0.0515   0.39299  0.49841
   0.31554 -0.12014 -0.06719  0.68955  0.35696  0.60325  0.0935   0.18036
   0.08615]
 [ 0.16433  0.10426  0.91684  1.       0.33061 -0.03134  0.26491  0.52488
   0.1277  -0.08262 -0.04623  0.54551  0.31972  0.56728  0.04858  0.13274
   0.06405]
 [ 0.12644  0.1242   0.67989  0.33061  1.      -0.06426  0.4424   0.21402
   0.51133 -0.13231 -0.07402  0.6281   0.2565   0.38407  0.13167  0.18255
   0.08602]
 [ 0.49668  0.09931 -0.0515  -0.03134 -0.06426  1.      -0.21558 -0.08679
  -0.17712  0.62853  0.65649 -0.07588  0.30399  0.45323  0.13921 -0.15296
  -0.06855]
 [-0.07805  0.22944  0.39299  0.26491  0

TypeError: slice indices must be integers or None or have an __index__ method