In [None]:
import numpy as np 
import os 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

In [None]:
# Correlation matrix
def plot_corr_matrix(df, g_width):
    """Plot the pairwise correlation matrix of a DataFrame's numeric columns.

    Columns that are non-numeric, contain any NaN, or hold a single unique
    value are excluded before the correlation is computed. If fewer than two
    columns remain, a message is printed and nothing is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified. If it carries a ``dataframeName``
        attribute, that value is used in the figure title.
    g_width : float
        Width and height passed to ``figsize`` (the figure is square).

    Returns
    -------
    None
    """
    # The ad-hoc attribute may be missing (pandas does not propagate custom
    # attributes through most operations), so fall back to a generic label.
    file_name = getattr(df, 'dataframeName', 'DataFrame')

    # corr() only makes sense for numeric data; on pandas >= 2.0 it raises
    # when string columns are present, so restrict to numeric columns first.
    numeric = df.select_dtypes(include='number')
    # `axis` must be passed by keyword: positional use fails on pandas >= 2.0.
    numeric = numeric.dropna(axis='columns')  # drop columns with NaN
    # keep columns where there are more than 1 unique values
    numeric = numeric[[col for col in numeric if numeric[col].nunique() > 1]]
    if numeric.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({numeric.shape[1]}) is less than 2')
        return

    corr = numeric.corr()
    fig = plt.figure(num=None, figsize=(g_width, g_width), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure just created rather than assuming it is figure 1.
    corr_matrix = plt.matshow(corr, fignum=fig.number)
    tick_positions = range(len(corr.columns))
    plt.xticks(tick_positions, corr.columns, rotation=90)
    plt.yticks(tick_positions, corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corr_matrix)
    plt.title(f'Correlation Matrix for {file_name}', fontsize=15)
    plt.show()

In [None]:
# Scatter and density plots
def plot_scatter_mat(df, plot_size, text_size, max_columns=10):
    """Draw a scatter matrix with KDE diagonals for a DataFrame's numeric columns.

    Non-numeric columns, columns containing NaN, and columns with a single
    unique value are dropped. At most ``max_columns`` of the remaining columns
    (in their original order) are plotted, and each upper-triangle panel is
    annotated with the pair's correlation coefficient.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    plot_size : float
        Width and height passed to ``figsize`` (the figure is square).
    text_size : float
        Font size of the correlation annotations.
    max_columns : int, optional
        Maximum number of columns to plot. Defaults to 10.

    Returns
    -------
    None
    """
    numeric = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # `axis` must be passed by keyword: positional use fails on pandas >= 2.0.
    numeric = numeric.dropna(axis='columns')
    # keep columns where there are more than 1 unique values
    numeric = numeric[[col for col in numeric if numeric[col].nunique() > 1]]
    numeric = numeric.iloc[:, :max_columns]
    if numeric.shape[1] == 0:
        # scatter_matrix cannot build a grid from zero columns.
        print('No scatter plots shown: no numeric, non-NaN, non-constant columns to plot')
        return

    ax = pd.plotting.scatter_matrix(numeric, alpha=0.75, figsize=[plot_size, plot_size], diagonal='kde')
    corr = numeric.corr().values
    # k=1 skips the diagonal, so only panels strictly above it are annotated.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corr[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=text_size)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
# Read at most n_rows data rows from the CSV.
n_rows = 1000

df = pd.read_csv(
    'Stock Prediction Data/all_stocks_5yr.csv',
    delimiter=',',
    nrows=n_rows,
)
# Label that plot_corr_matrix reads back for its figure title.
df.dataframeName = 'all_stocks_5yr.csv'

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first few rows of the loaded frame.
df.head()

In [None]:
# Correlation heat map for the first dataset (square figure, side 8).
# The frame loaded above is named `df`; `df1` is never defined in this notebook.
plot_corr_matrix(df, 8)

In [None]:
# Scatter/KDE matrix for the first dataset: figure size 15x15, annotation text size 10.
plot_scatter_mat(df, 15, 10)

## 2nd Dataset

In [None]:
# Read at most n_rows data rows from the second CSV.
n_rows = 1000

# NOTE(review): 'file-name' looks like a placeholder path - confirm the real
# CSV location before running this cell.
df_1 = pd.read_csv(
    r'file-name',
    delimiter=',',
    nrows=n_rows,
)
# Label that plot_corr_matrix reads back for its figure title.
df_1.dataframeName = 'data.csv'

In [None]:
# Preview the first few rows of the second frame.
df_1.head()

In [None]:
# (rows, columns) of the second frame.
df_1.shape

In [None]:
# Correlation heat map for the second dataset (square figure, side 8).
plot_corr_matrix(df_1, 8)

In [None]:
# Scatter/KDE matrix for the second dataset: figure size 15x15, annotation text size 10.
plot_scatter_mat(df_1, 15, 10)