## EDA

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn import metrics
import scipy.stats as stats
from sklearn.metrics import mean_squared_error

# Read the homework CSV and discard its 'Date' column in one step.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact file exists.
df = pd.read_csv(r"C:\Users\Adam\Desktop\hw5_data.csv").drop(columns='Date')

# Preview the first rows (shown as the cell output when run in a notebook).
df.head()

In [None]:
# Report the dimensions of the frame, then its per-column summary.
n_rows, n_cols = df.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)
df.info()

In [None]:
# Keep only the rows that have no missing value in any column, then
# print the column summary again to confirm the new row count.
df = df.dropna(axis=0, how='any')
df.info()

In [None]:
import matplotlib.pyplot as plt
from scipy import stats

# Q-Q plot of the 'Adj_Close' column against a normal distribution,
# drawn onto the current pyplot figure.
adj_close = df["Adj_Close"]
stats.probplot(adj_close, dist="norm", plot=plt)

plt.ylabel("Adj_Close")
plt.title("Q-Q plot")
plt.show()

In [None]:
# Pairwise correlation between all columns; df.corr() already returns a
# DataFrame, so no extra wrapping is needed.
corMat = df.corr()

# Render the correlation matrix as a pseudocolor grid.
plt.pcolor(corMat)
plt.show()

In [None]:
# Target is the last column; features are every column between the first
# and the last.
# NOTE(review): the slice starts at 1, so the column at position 0 is not
# used as a feature -- confirm this is intended now that 'Date' is dropped.
y = df.iloc[:, -1]
X = df.iloc[:, 1:-1]

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Standardize the features: learn mean/variance on the training split only,
# then apply the same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Covariance matrix of the standardized training features
# (np.cov expects variables in rows, hence the transpose).
cov_mat = np.cov(X_train_std.T)

# A covariance matrix is symmetric, so use eigh: unlike the general-purpose
# eig it always returns real eigenvalues and is numerically more stable.
# eigh returns the eigenvalues in ascending order; the cells that consume
# them sort explicitly, so the order here does not matter.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)
print("Eigen Values:  \n ", eigen_vals)


In [None]:
# Share of the total variance carried by each eigenvalue, largest first,
# plus the running total.
tot = sum(eigen_vals)
var_exp = [(val / tot) for val in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Number the components 1..k from the data itself instead of a hard-coded
# range, so the x axis starts at component 1 and always matches the number
# of eigenvalues.
pc_index = range(1, len(var_exp) + 1)

plt.bar(pc_index, var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(pc_index, cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Build (|eigenvalue|, eigenvector) tuples -- the eigenvector for entry k is
# column k of eigen_vecs -- ordered from the largest eigenvalue to the smallest.
eigen_pairs = sorted(
    ((np.abs(val), eigen_vecs[:, idx]) for idx, val in enumerate(eigen_vals)),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Projection matrix: the eigenvectors of the two largest eigenvalues,
# placed side by side as columns.
w = np.column_stack((eigen_pairs[0][1], eigen_pairs[1][1]))

print('Matrix w: \n', w)

In [None]:
# Project the standardized training data onto the two selected components.
X_train_pca = X_train_std.dot(w)

# y_train is used as a regression target in this notebook, so grouping the
# points by unique target value (with only three colours available) would
# draw just the three smallest values. Plot every sample instead and encode
# the target value as colour.
points = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1],
                     c=np.asarray(y_train), cmap='viridis', marker='o')
plt.colorbar(points, label=str(y_train.name))

plt.xlabel('PC 1')
plt.ylabel('PC 2')
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

## Linear Regression


In [None]:
# Baseline: ordinary least squares fitted on the original (unscaled) features.
reg = LinearRegression().fit(X_train, y_train)

# Predictions on both splits.
lr_y_train_pred = reg.predict(X_train)
lr_y_test_pred = reg.predict(X_test)

# Coefficient of determination for each split.
score_train = r2_score(y_true=y_train, y_pred=lr_y_train_pred)
score_test = r2_score(y_true=y_test, y_pred=lr_y_test_pred)

print("R-Squared score for training: ", score_train)
print("R-Squared score for test: ", score_test)

## Support Vector Regression (SVR)

In [None]:
# Baseline: linear-kernel support vector regression fitted on the original
# (unscaled) features.
clf_svr = svm.SVR(kernel='linear').fit(X_train, y_train)

# Predictions on both splits.
SVM_y_pred_test = clf_svr.predict(X_test)
SVM_y_pred_train = clf_svr.predict(X_train)

# Coefficient of determination for each split.
SVM_score_train = r2_score(y_true=y_train, y_pred=SVM_y_pred_train)
SVM_score_test = r2_score(y_true=y_test, y_pred=SVM_y_pred_test)

print("R-Squared score for training: ", SVM_score_train)
print("R-Squared score for test: ", SVM_score_test)

## PCA Analysis - Explained variance for all components

In [None]:
# Fit PCA keeping every component so the full variance spectrum is visible.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_std)
print(pca.explained_variance_ratio_)

# Number the components 1..k from the fitted model instead of a hard-coded
# range, so the x axis starts at component 1 and always matches the number
# of components actually produced.
component_idx = np.arange(1, pca.n_components_ + 1)
cum_var = np.cumsum(pca.explained_variance_ratio_)

plt.bar(component_idx, pca.explained_variance_ratio_, alpha=0.5,
        align='center', label='individual explained variance')
plt.step(component_idx, cum_var, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()

## PCA Analysis - 3 components 

In [None]:
# Reduce the standardized training features to three principal components.
pca2 = PCA(n_components=3)
X_train_c3 = pca2.fit_transform(X_train_std)

# Bar chart of the variance ratio of each retained component; the x axis
# uses zero-based component positions.
features = range(pca2.n_components_)
plt.bar(x=features, height=pca2.explained_variance_ratio_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

In [None]:
# Covariance of the 3-component projection. This must use X_train_c3:
# X_train_pca does not hold three components, so its eigenvalue count does
# not match the three-bar plot below.
cov_mat = np.cov(X_train_c3.T)

# The covariance matrix is symmetric, so eigh is the appropriate solver
# (always real eigenvalues); the values are sorted explicitly below.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)
print('\nEigenvalues \n%s' % eigen_vals)

# Ratios are relative to the variance retained by these components
# (they sum to 1), not to the variance of the original feature set.
tot = sum(eigen_vals)
var_exp = [(val / tot) for val in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Derive the component numbering from the data so x and heights always agree.
pc_index = range(1, len(var_exp) + 1)
plt.bar(pc_index, var_exp, alpha=0.5, align='center', label='var_exp')
plt.step(pc_index, cum_var_exp, where='mid', label='cumulative var_exp')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='upper right')
print("Plot for individual and cumulative explained variance for n_components=3")
plt.show()

## Linear and SVR models after dimension reduction

In [None]:
# Project the standardized test features with the PCA fitted on the
# training split, then refit the linear model on the 3-component data.
X_test_pca_c3 = pca2.transform(X_test_std)
reg.fit(X_train_c3, y_train)

y_test_pred = reg.predict(X_test_pca_c3)
y_train_pred = reg.predict(X_train_c3)

# Score the predictions of THIS (reduced-dimension) model. Scoring the
# lr_y_*_pred arrays would merely repeat the full-feature baseline results.
score_train = r2_score(y_train, y_train_pred)
score_test = r2_score(y_test, y_test_pred)

print("R-Squared score for training: ", score_train)
print("R-Squared score for test: ", score_test)

In [None]:
# Linear-kernel support vector regression fitted on the 3-component
# PCA projection of the training data.
clf_svr = svm.SVR(kernel='linear').fit(X_train_c3, y_train)

# Predictions on both projected splits.
SVM_y_pred_test = clf_svr.predict(X_test_pca_c3)
SVM_y_pred_train = clf_svr.predict(X_train_c3)

# Coefficient of determination for each split.
SVM_score_train = r2_score(y_true=y_train, y_pred=SVM_y_pred_train)
SVM_score_test = r2_score(y_true=y_test, y_pred=SVM_y_pred_test)

print("R-Squared score for training: ", SVM_score_train)
print("R-Squared score for test: ", SVM_score_test)

In [None]:
# Author identification and academic-integrity statement, one line each.
for statement in (
    "My name is Adam Bettis",
    "My NetID is: abettis2",
    "I hereby certify that I have read the University policy on Academic Integrity and that I am not in violation.",
):
    print(statement)