# Correlation coefficients and Principal Component Analysis

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

%matplotlib inline
%load_ext nb_black

<IPython.core.display.Javascript object>

## Dataset handling

In [2]:
# Load the dataset from a path relative to the current working directory and
# preview its first rows.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# relative path only resolves when the notebook is started from a directory
# whose parent contains data/FODS-A2.csv.
df = pd.read_csv("../data/FODS-A2.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/FODS-A2.csv'

<IPython.core.display.Javascript object>

## Feature Selection using Pearson Coefficient

In [None]:
# Target values as a 1-D array, and every other column as a matrix that is
# transposed so that each row of df1 holds the values of one feature column.
y = df['Appliances'].to_numpy()
df1 = df.drop(columns='Appliances').to_numpy().T

In [None]:
# Number of rows in df1 (one per feature column), shown as the cell output.
len(df1)

In [None]:
# Absolute Pearson correlation between each feature row of df1 and the target.
# np.corrcoef returns the correlation matrix of its two inputs; entry [0][1]
# is the feature/target coefficient.
pearsonCoeff = [abs(np.corrcoef(feature_row, y)[0][1]) for feature_row in df1]

In [None]:
# Turn the list of coefficients into a table and attach the matching feature
# names (the columns of df without the target, in the same order as df1's rows).
pearsonCoeff = pd.DataFrame(
    pearsonCoeff, columns=['Absolute Pearson Correlation Coeff']
).assign(Feature=df.drop(columns='Appliances').columns)

In [None]:
# Rank the features from the strongest to the weakest absolute correlation.
pearsonCoeff = pearsonCoeff.sort_values(
    by="Absolute Pearson Correlation Coeff",
    ascending=False,
)

In [None]:
# Display the ranked correlation table as the cell output.
pearsonCoeff

In [None]:
# Build nested feature subsets: set1[k] holds the names of the first k + 1 rows
# of the ranked pearsonCoeff table, i.e. the k + 1 most correlated features.
set1 = []
for i in range(df1.shape[0]):
    top_features = pearsonCoeff["Feature"].iloc[: i + 1].tolist()
    set1.append(np.array(top_features))

In [None]:
print(len(set1))
# The entries of set1 have different lengths (1, 2, ..., n), so the container
# must be an object array: recent NumPy versions refuse to build a ragged array
# implicitly and raise ValueError without an explicit dtype=object.
set1 = np.array(set1, dtype=object)

## Feature Selection using PCA

In [None]:
def pcatransform(x, dim):
    """Project the feature columns of ``x`` onto ``dim`` principal components.

    Parameters
    ----------
    x : pandas.DataFrame
        Data containing an ``'Appliances'`` target column; every other column
        is passed to PCA.
    dim : int
        Number of principal components to keep.

    Returns
    -------
    x_pca : pandas.DataFrame
        The transformed features (columns named by
        ``pca.get_feature_names_out()``) plus the ``'Appliances'`` column, on a
        fresh 0..n-1 index.
    variance : float
        Percentage of the total variance explained by the kept components.

    Notes
    -----
    NOTE(review): PCA is fitted on all rows of ``x``, so when the caller splits
    into train/test afterwards the projection has already seen the test rows.
    """
    target = x['Appliances']
    features = x.drop('Appliances', axis=1)

    pca = PCA(n_components=dim)
    components = pca.fit_transform(features)
    x_pca = pd.DataFrame(components, columns=pca.get_feature_names_out())

    # x_pca carries a new default index, so assign the raw target values:
    # assigning the Series itself would align on index labels and produce
    # NaN / mismatched targets whenever x is not indexed 0..n-1.
    x_pca['Appliances'] = target.to_numpy()

    # The last cumulative ratio is the total explained by all kept components.
    variance = pca.explained_variance_ratio_.cumsum()[-1]
    variance *= 100
    return x_pca, variance

## Functions for regression

In [None]:
def split(df):
    """Shuffle ``df`` and split it 80/20 into train and test parts.

    The shuffle uses a fixed ``random_state`` of 100, so the split is the same
    on every call for the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing an ``'Appliances'`` target column.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature columns (everything except ``'Appliances'``).
    y_train, y_test : numpy.ndarray
        Target values for the corresponding rows.
    """
    target = "Appliances"

    # Shuffle all rows, then cut at 80% of the row count (rounded down).
    shuffled = df.sample(frac=1, random_state=100)
    cutoff = int(0.8 * len(df))
    train_part, test_part = shuffled.iloc[:cutoff], shuffled.iloc[cutoff:]

    return (
        train_part.drop(columns=target),
        test_part.drop(columns=target),
        train_part[target].to_numpy(),
        test_part[target].to_numpy(),
    )

In [None]:
def batchnorm(X_train, X_test):
    """Standardize both sets with the training set's mean and standard deviation.

    Despite the name this is plain feature standardization: each column has the
    training mean subtracted and is divided by the training standard deviation
    (pandas' default sample standard deviation). The test set is scaled with
    the training statistics.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables with the same columns.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        The standardized tables, same type and labels as the inputs.
    """
    mean = X_train.mean()
    stddev = X_train.std()

    # A constant training column has zero spread; dividing by it would fill the
    # column with NaN/inf. Scale such columns by 1 so they become all zeros in
    # the training set instead.
    if np.ndim(stddev) == 0:
        stddev = 1.0 if stddev == 0 else stddev
    else:
        stddev = stddev.mask(stddev == 0, 1.0)

    X_train = (X_train - mean) / stddev
    X_test = (X_test - mean) / stddev
    return X_train, X_test

In [None]:
def initialize(dim):
    """Return a reproducible initial bias and weight vector.

    Parameters
    ----------
    dim : int
        Number of weights to create.

    Returns
    -------
    b : float
        Initial bias, drawn uniformly from [0, 1).
    theta : numpy.ndarray
        Array of shape ``(1, dim)`` with values drawn uniformly from
        [0, 0.01).

    Notes
    -----
    NumPy's global random generator is re-seeded with 42 on every call.
    """
    np.random.seed(42)
    # Draw the weights first so they match what a seed of 42 gave before.
    theta = np.random.rand(1, dim) * 0.01
    # Take the bias from the seeded NumPy generator as well; the stdlib
    # ``random`` module is not affected by np.random.seed, so drawing from it
    # made the bias differ from run to run.
    b = float(np.random.rand())
    return b, theta

In [None]:
def gradient_descent(num_epochs, X, t, theta, bias, lr):
    """Fit a linear model with full-batch gradient descent.

    Parameters
    ----------
    num_epochs : int
        Number of update steps to run.
    X : array-like
        Feature matrix; it is transposed before being multiplied by ``theta``.
        Assumes one sample per row -- TODO confirm against callers.
    t : array-like
        Target values compared against the predictions.
    theta : numpy.ndarray
        Initial weights. Assumes a 2-D row vector, since the sample count is
        read from the second axis of the predictions.
    bias : float
        Initial bias.
    lr : float
        Learning rate applied to both gradients.

    Returns
    -------
    costs : list
        Value of ``get_cost`` before each update, one entry per epoch.
    theta, bias
        The parameters after the last update.
    """
    cost_history = []
    for _ in range(num_epochs):
        # Forward pass and cost with the current parameters.
        predictions = np.dot(theta, X.T) + bias
        cost_history.append(get_cost(predictions, t))

        # Gradients, averaged over the samples.
        n_samples = predictions.shape[1]
        grad_theta = (1 / n_samples) * np.dot(predictions - t, X)
        grad_bias = (1 / n_samples) * np.sum(predictions - t)

        # Parameter update.
        theta = theta - lr * grad_theta
        bias = bias - lr * grad_bias

    return cost_history, theta, bias

In [None]:
def get_cost(y, t):
    """Return half the mean squared difference between ``t`` and ``y``.

    Parameters
    ----------
    y : numpy.ndarray
        Predictions; the sample count is taken from its second axis, so it
        must be 2-D.
    t : array-like
        Target values, broadcastable against ``y``.

    Returns
    -------
    float
        ``0.5 * sum((t - y) ** 2) / y.shape[1]``.
    """
    residual = t - y
    squared_error = np.sum(residual ** 2)
    return 0.5 * squared_error / y.shape[1]

In [None]:
def fwd_prop_test(X, y_test, theta, bias):
    """Evaluate the linear model on ``X`` and return its cost against ``y_test``.

    Parameters
    ----------
    X : array-like
        Feature matrix; it is transposed before being multiplied by ``theta``.
    y_test : array-like
        Target values for the rows of ``X``.
    theta, bias
        Model weights and bias.

    Returns
    -------
    The value of ``get_cost`` for the predictions ``theta . X^T + bias``.
    """
    return get_cost(np.dot(theta, X.T) + bias, y_test)

In [None]:
def batch_gd_pearson(dim, num_epochs, learning_rate):
    """Train and evaluate the linear model on the ``dim`` top-ranked features.

    Reads the module-level ``df`` and ``set1``: the columns listed in
    ``set1[dim - 1]`` plus ``'Appliances'`` are selected, split, standardized
    and fitted with batch gradient descent. The training cost curve is plotted
    as a side effect.

    Parameters
    ----------
    dim : int
        Number of features to use (1-based index into ``set1``).
    num_epochs : int
        Number of gradient-descent steps.
    learning_rate : float
        Step size for gradient descent.

    Returns
    -------
    tuple
        ``(test cost, final training cost, bias, weights)``.
    """
    selected_columns = np.append(set1[dim - 1], 'Appliances')
    X_train, X_test, y_train, y_test = split(df[selected_columns])
    X_train, X_test = batchnorm(X_train, X_test)

    bias, weights = initialize(dim)
    train_costs, weights, bias = gradient_descent(
        num_epochs, X_train, y_train, weights, bias, learning_rate
    )

    # Training cost per epoch.
    plt.plot(train_costs)
    plt.xlabel("Epochs")
    plt.ylabel("MSE")
    plt.title("LOSS CURVE")
    plt.show()

    test_cost = fwd_prop_test(X_test, y_test, weights, bias)
    return test_cost, train_costs[-1], bias, weights

In [None]:
def batch_gd_pca(dim, num_epochs, learning_rate):
    """Train and evaluate the linear model on ``dim`` principal components.

    Reads the module-level ``df``: it is projected with ``pcatransform``, then
    split, standardized and fitted with batch gradient descent. The training
    cost curve is plotted as a side effect.

    Parameters
    ----------
    dim : int
        Number of principal components to keep.
    num_epochs : int
        Number of gradient-descent steps.
    learning_rate : float
        Step size for gradient descent.

    Returns
    -------
    tuple
        ``(test cost, final training cost, bias, weights, variance)`` where
        ``variance`` is the second value returned by ``pcatransform``.
    """
    projected, variance = pcatransform(df, dim)
    X_train, X_test, y_train, y_test = split(projected)
    X_train, X_test = batchnorm(X_train, X_test)

    bias, weights = initialize(dim)
    train_costs, weights, bias = gradient_descent(
        num_epochs, X_train, y_train, weights, bias, learning_rate
    )

    # Training cost per epoch.
    plt.plot(train_costs)
    plt.xlabel("Epochs")
    plt.ylabel("MSE")
    plt.title("LOSS CURVE")
    plt.show()

    test_cost = fwd_prop_test(X_test, y_test, weights, bias)
    return test_cost, train_costs[-1], bias, weights, variance

## Regression after Pearson Coefficient Feature Selection

In [None]:
# Accumulators for the Pearson-selection sweep: one entry per feature count.
test_mse_arr1, train_mse_arr1 = [], []
bias_arr1, weights_arr1 = [], []

In [None]:
# Train one model per feature count (1..26 top-ranked features), each for 700
# epochs with learning rate 0.01, and record its costs and parameters.
# NOTE(review): 26 is hard-coded; presumably it equals the number of feature
# columns (df1.shape[0]) -- confirm, since a smaller dataset would make
# set1[dim - 1] go out of range.
for i in range(26):
    test_mse, train_mse, bias, weights = batch_gd_pearson(i+1, 700, 0.01)
    train_mse_arr1.append(train_mse)
    test_mse_arr1.append(test_mse)
    weights_arr1.append(weights)
    bias_arr1.append(bias)

In [None]:
# Rebind the list of training costs to a one-column DataFrame. Because the name
# is reused, re-running this cell wraps the DataFrame again instead of the
# original list.
train_mse_arr1 = pd.DataFrame(train_mse_arr1, columns = ['Train MSE'])
# Shift the index to start at 1 so it equals the number of features used.
train_mse_arr1.index +=1
plt.plot(train_mse_arr1)
plt.xlabel("No. of Features") 
plt.ylabel("Train Error") 
plt.title("Training Error after Pearson Coefficient Feature Selection") 
# Save before plt.show(); the target directory ../results must already exist.
plt.savefig('../results/1train.jpg')
plt.show()

In [None]:
# Rebind the list of test costs to a one-column DataFrame. Because the name is
# reused, re-running this cell wraps the DataFrame again instead of the
# original list.
test_mse_arr1 = pd.DataFrame(test_mse_arr1, columns = ['Test MSE'])
# Shift the index to start at 1 so it equals the number of features used.
test_mse_arr1.index +=1
plt.plot(test_mse_arr1)
plt.xlabel("No. of Features") 
plt.ylabel("Test Error") 
plt.title("Testing Error after Pearson Coefficient Feature Selection") 
# Save before plt.show(); the target directory ../results must already exist.
plt.savefig('../results/1test.jpg')
plt.show()

In [None]:
# idxmin() on a one-column DataFrame returns a Series keyed by the column name.
# Read its single entry by position with .iloc: plain [0] on a label-indexed
# Series is a deprecated positional fallback that newer pandas rejects.
print("no. of features for minimum test error =", test_mse_arr1.idxmin().iloc[0])
print("no. of features for minimum train error =", train_mse_arr1.idxmin().iloc[0])

In [None]:
# Put train and test costs side by side (rows matched on the shared index) and
# export them to a spreadsheet.
# NOTE(review): writing .xlsx presumably needs an Excel writer engine to be
# installed, and ../results must already exist -- confirm in the environment.
table1 = pd.concat([train_mse_arr1, test_mse_arr1], axis=1, join='inner')
table1.to_excel('../results/Results_PCC.xlsx')

## Regression after PCA

In [None]:
# Accumulators for the PCA sweep: one entry per number of components.
test_mse_arr2, train_mse_arr2 = [], []
bias_arr2, weights_arr2 = [], []
var = []

In [None]:
# Train one model per number of principal components (1..26), each for 700
# epochs with learning rate 0.01, and record its costs, parameters and the
# variance percentage reported by batch_gd_pca.
# NOTE(review): 26 is hard-coded; presumably it equals the number of feature
# columns -- confirm against the dataset.
for i in range(26):
    test_mse, train_mse, bias, weights, variance = batch_gd_pca(i+1, 700, 0.01)
    train_mse_arr2.append(train_mse)
    test_mse_arr2.append(test_mse)
    weights_arr2.append(weights)
    bias_arr2.append(bias)
    var.append(variance)

In [None]:
# Rebind the list of training costs to a one-column DataFrame. Because the name
# is reused, re-running this cell wraps the DataFrame again instead of the
# original list.
train_mse_arr2 = pd.DataFrame(train_mse_arr2, columns = ['Train MSE'])
# Shift the index to start at 1 so it equals the number of components used.
train_mse_arr2.index +=1
plt.plot(train_mse_arr2)
plt.xlabel("No. of Features") 
plt.ylabel("Train Error") 
plt.title("Regression Model Train Errors after PCA") 
# Save before plt.show(); the target directory ../results must already exist.
plt.savefig('../results/2train.jpg')
plt.show()

In [None]:
# Rebind the list of test costs to a one-column DataFrame. Because the name is
# reused, re-running this cell wraps the DataFrame again instead of the
# original list.
test_mse_arr2 = pd.DataFrame(test_mse_arr2, columns = ['Test MSE'])
# Shift the index to start at 1 so it equals the number of components used.
test_mse_arr2.index +=1
plt.plot(test_mse_arr2)
plt.xlabel("No. of Features") 
plt.ylabel("Test Error") 
plt.title("Regression Model Test Errors after PCA") 
# Save before plt.show(); the target directory ../results must already exist.
plt.savefig('../results/2test.jpg')
plt.show()

In [None]:
# Rebind the list of explained-variance percentages to a one-column DataFrame.
# Because the name is reused, re-running this cell wraps the DataFrame again
# instead of the original list.
var = pd.DataFrame(var, columns = ['Variance %'])
# Shift the index to start at 1 so it equals the number of components used.
var.index +=1
plt.plot(var)
plt.xlabel("No. of Features") 
plt.ylabel("Variance Percentage") 
plt.title("Variance Captured vs. Number of features") 
# Save before plt.show(); the target directory ../results must already exist.
plt.savefig('../results/2var.jpg')
plt.show()

In [None]:
# Display the explained-variance table as the cell output.
var

In [None]:
# idxmin()/idxmax() on a one-column DataFrame return a Series keyed by the
# column name. Read the single entry by position with .iloc: plain [0] on a
# label-indexed Series is a deprecated positional fallback that newer pandas
# rejects.
print("no. of features for minimum test error =", test_mse_arr2.idxmin().iloc[0])
print("no. of features for minimum train error =", train_mse_arr2.idxmin().iloc[0])
print("no. of features for max variance captured =", var.idxmax().iloc[0])

In [None]:
# Put train costs, test costs and variance percentages side by side (rows
# matched on the shared index) and export them to a spreadsheet.
# NOTE(review): writing .xlsx presumably needs an Excel writer engine to be
# installed, and ../results must already exist -- confirm in the environment.
table2 = pd.concat([train_mse_arr2, test_mse_arr2, var], axis=1, join='inner')
table2.to_excel('../results/Results_PCA.xlsx')