# Initial Steps

## Mounting Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

## Libraries

In [None]:
import sys
import os

#Plots
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Data Processing
import pandas as pd
import numpy as np
import math

#Table Formatter
from google.colab.data_table import DataTable
DataTable.max_columns = 50

#Principal Components
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA




## Methods

### Correlation

In [None]:
#To create Pearson Correlation Matrix call the function:
#pearson_corr_df(df)


#pearson_corr method to calculate correlation between two variables with Pearson Correlation Formula
#returns the calculated correlation between two variables
def pearson_corr(col1, col2):
  """Compute the Pearson correlation coefficient of two equal-length columns.

  Parameters
  ----------
  col1, col2 : array-like of numbers (pandas Series, NumPy array, list, ...)
      Values are paired by position; pandas index labels are ignored.

  Returns
  -------
  float
      The correlation coefficient. The result is NaN when either column has
      zero variance. When the lengths differ, a message is printed and the
      sentinel -1 is returned (kept for backward compatibility; note that it
      cannot be told apart from a genuine correlation of -1).
  """
  if len(col1) != len(col2):
    print("The columns should have same length! Correlation calculation failed!")
    return -1
  # Convert to plain arrays so that elements are paired by position.
  # Indexing a Series with col[i] is a label lookup, which fails (or silently
  # relies on a deprecated positional fallback) when the index is not 0..n-1.
  x = np.asarray(col1, dtype=float)
  y = np.asarray(col2, dtype=float)
  dev_x = x - x.mean()
  dev_y = y - y.mean()
  # covariance term divided by the product of the two deviation norms
  return (dev_x * dev_y).sum() / np.sqrt((dev_x ** 2).sum() * (dev_y ** 2).sum())

#pearson_corr_df takes a df and returns its Pearson correlation matrix, styled with a background gradient
def pearson_corr_df(df):
  """Build the Pearson correlation matrix of the numeric columns of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data; non-numeric columns are ignored.

  Returns
  -------
  pandas Styler
      The square correlation matrix (numeric column labels on both axes)
      with ``background_gradient()`` applied for display.
  """
  labels = df.select_dtypes(include=["number"]).columns
  matrix = pd.DataFrame(0.0, index=labels, columns=labels)
  for row in labels:
    for col in labels:
      # One .loc assignment writes into `matrix` itself. The chained form
      # matrix[col][row] = ... assigns to a temporary column under pandas
      # copy-on-write and would leave the matrix filled with zeros.
      matrix.loc[row, col] = pearson_corr(df[row], df[col])
  return matrix.style.background_gradient()

In [None]:
def centralize(df):
  """Return a new frame in which every column of ``df`` has its mean subtracted.

  The input frame is left untouched.
  """
  # Mean of each original column, indexed by column label.
  column_means = df.T.mean(axis=1)
  # Frame minus Series aligns the Series on the column labels.
  return df - column_means

In [None]:
def PCA_fromScratch(df):
  """Principal component analysis via eigendecomposition of the covariance matrix.

  The columns of ``df`` are standardized with ``scale``, the covariance matrix
  of the standardized data is printed, and its eigenpairs are computed.

  Parameters
  ----------
  df : pandas.DataFrame
      Numeric data, one observation per row.

  Returns
  -------
  evalue : numpy.ndarray
      Eigenvalues of the covariance matrix, sorted in descending order.
  evect : numpy.ndarray
      Eigenvectors as columns, in the same order as ``evalue``.
  scores : pandas.DataFrame
      Standardized data projected on the eigenvectors; columns are named
      PC1..PCk (k = number of columns of ``df``) and the index is ``df``'s.
  """
  features = pd.DataFrame(scale(df), index=df.index, columns=df.columns)
  cov = features.cov()
  print("Covariance Matrix:\n",cov)
  evalue, evect = np.linalg.eig(cov)
  # np.linalg.eig gives no ordering guarantee; sort the eigenpairs so that
  # PC1 really is the direction of largest variance.
  order = np.argsort(evalue)[::-1]
  evalue, evect = evalue[order], evect[:, order]
  # Derive the labels from the data instead of assuming exactly four columns.
  labels = ["PC"+str(i) for i in range(1, len(evalue)+1)]
  scores = pd.DataFrame(features.to_numpy().dot(evect), columns=labels, index=features.index)
  return evalue, evect, scores

## Initial Data

In [None]:
!ls /content/gdrive/MyDrive/LowRankApproaches/Data

In [None]:
path = "/content/gdrive/MyDrive/LowRankApproaches/Data/USArrests.csv"

In [None]:
df = pd.read_csv(path,index_col = 0)
df

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.var()

## Correlation

In [None]:
pearson_corr_df(df)

## Principal Components

In [None]:
evalue, evect,df_plot = PCA_fromScratch(df)
df_plot

In [None]:
evalue

In [None]:
evect

In [None]:
pd.DataFrame(scale(df),index = df.index, columns=df.columns).describe()

In [None]:
# df_scaled is otherwise first assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell would raise NameError. Build it here with the
# same formula used later in the notebook.
df_scaled = (df-df.mean())/df.std()
df_scaled.describe()

In [None]:
df_scaled.var()

In [None]:
pd.DataFrame(scale(df),index = df.index, columns=df.columns).var()

In [None]:
X = pd.DataFrame(scale(df), index=df.index, columns=df.columns)

In [None]:
evect = pd.DataFrame(evect, index=df.columns, columns=['v1','v2','v3','v4'])
evect

In [None]:
# Biplot: observations on the first two principal components (bottom/left axes)
# overlaid with the loading vectors of each feature (top/right axes).
# Scores and loadings are both negated, so the two layers stay consistent.
fig , ax1 = plt.subplots(figsize=(9,7))
ax1.set_xlim(-3.5,3.5)
ax1.set_ylim(-3.5,3.5)
# Plot Principal Components 1 and 2
for i in df_plot.index:
  ax1.annotate(i, (-df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')
# Plot reference lines
ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
# Plot Principal Component loading vectors, using a second pair of axes.
ax2 = ax1.twinx().twiny()
ax2.set_ylim(-1,1)
ax2.set_xlim(-1,1)
# Plot labels for vectors. Variable 'a' is a small offset parameter to separate arrow tip and text.
a = 1.07
for i in evect.index:
  ax2.annotate(i, (-evect.v1.loc[i]*a, -evect.v2.loc[i]*a), color='red')
# Plot vectors: one arrow per feature, looked up by label. Integer keys such as
# evect.v1[0] on a label-indexed Series rely on a deprecated positional fallback.
for i in evect.index:
  ax2.arrow(0, 0, -evect.loc[i, "v1"], -evect.loc[i, "v2"])

In [None]:
# Fit scikit-learn's PCA on X (the scaled data frame built above) and keep the
# transformed scores, indexed like X.
# NOTE(review): this rebinds df_plot, which until now held the PC1..PC4 scores
# returned by PCA_fromScratch. No column names are passed here, so re-running
# the biplot cell after this one will not find df_plot.PC1 / df_plot.PC2.
pca = PCA()
df_plot = pd.DataFrame(pca.fit_transform(X), index=X.index)

In [None]:
pca.explained_variance_

In [None]:
pca.explained_variance_ratio_

In [None]:
# Scree plot: proportion of variance explained by each principal component.
# Component positions come from the fitted PCA, not a hard-coded [1,2,3,4].
pve = pca.explained_variance_ratio_
component_ids = np.arange(1, len(pve) + 1)
plt.figure(figsize=(7,5))
plt.plot(component_ids, pve, "-o")
plt.ylabel("Proportion of Variance Explained")
plt.xlabel("Principal Component")
plt.xlim(0.75, len(pve) + 0.25)
plt.ylim(0,1.05)
plt.xticks(component_ids);

In [None]:
# Cumulative proportion of variance explained by the first k principal components.
# Component positions come from the fitted PCA, not a hard-coded [1,2,3,4].
cumulative_pve = np.cumsum(pca.explained_variance_ratio_)
component_ids = np.arange(1, len(cumulative_pve) + 1)
plt.figure(figsize=(7,5))
plt.plot(component_ids, cumulative_pve, "-s")
# The curve is the running total, so label it as cumulative.
plt.ylabel("Cumulative Proportion of Variance Explained")
plt.xlabel("Principal Component")
plt.xlim(0.75, len(cumulative_pve) + 0.25)
plt.ylim(0,1.05)
plt.xticks(component_ids);

In [None]:
df

In [None]:
df_scaled = (df-df.mean())/df.std()


In [None]:
df_scaled

In [None]:
# Full singular value decomposition of the standardized data.
# Note: np.linalg.svd returns the right singular vectors already transposed,
# so `v` here is V^T and the data is reconstructed as u @ Sigma @ v.
u, s, v = np.linalg.svd(df_scaled, full_matrices=True)

In [None]:
print(u.shape)
print(s.shape)
print(v.shape)

In [None]:
# Rectangular Sigma: (rows of u) x (rows of v), singular values on the diagonal.
# Do not wrap the stacked matrix in np.diag: on a 2-D input np.diag extracts
# the diagonal and returns a 1-D array instead of building a matrix.
sigma = np.zeros((u.shape[0], v.shape[0]))
sigma[:len(s), :len(s)] = np.diag(s)
sigma.shape

In [None]:
# Reconstruct the data as u @ Sigma @ v. Sigma has to be the rectangular
# (rows of u) x (rows of v) matrix with s on its diagonal; passing the 1-D
# diagonal instead makes the matrix product fail with a shape mismatch.
sigma = np.zeros((u.shape[0], v.shape[0]))
sigma[:len(s), :len(s)] = np.diag(s)
np.matmul(np.matmul(u, sigma), v)

In [None]:
# Share of the total variance carried by each singular value (fractions in [0, 1]).
var_explained = np.round(s**2/np.sum(s**2), decimals=3)

sns.barplot(x=list(range(1,len(var_explained)+1)),
            y=var_explained, color="limegreen")
plt.xlabel('SVs', fontsize=16)
# The plotted values are fractions, not percentages, so label them as proportions.
plt.ylabel('Proportion of Variance Explained', fontsize=16)
plt.savefig('svd_scree_plot.png',dpi=100)

The scree plot above shows the share of variance explained by each singular vector. The first singular vector accounts for most of the variation in the data.

In [None]:
# Table of the first three left singular vectors, with the original row labels
# moved into a regular 'states' column.
labels = ['SV' + str(k) for k in (1, 2, 3)]
svd_df = (
  pd.DataFrame(u[:, :3], index=df.index.tolist(), columns=labels)
  .reset_index()
  .rename(columns={'index': 'states'})
)
svd_df.head()

In [None]:

# Scatter plot: SV1 and SV2
sns.scatterplot(x="SV1", y="SV2", hue="states",
                data=svd_df, s=100,
                alpha=0.7)

# var_explained is rounded to 3 decimals, so one decimal place of the percentage
# is lossless; the explicit format keeps float artifacts out of the axis labels.
plt.xlabel('SV 1: {0:.1f}%'.format(var_explained[0]*100), fontsize=16)
plt.ylabel('SV 2: {0:.1f}%'.format(var_explained[1]*100), fontsize=16);

In [None]:
len(df)

In [None]:
df.columns