### NVIDIA Script

In [1]:
import re
import math
import urllib3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def decoding(url: str):
    """The decoding function sends a request to the URL and it retrieves the data as shown in the front-end."""

    https = urllib3.PoolManager()
    r = https.request('GET', url)
    return r.data.decode(encoding = 'UTF-8')

def headers(results: str):
    """As soon as the data has been decoded into string, through the use regular expressions the headers function looks for the html snippet, which is set within, to save and organize headers into a list."""

    pattern = r'<th class.*?>(.*?)</th>'
    regex = re.compile(pattern)
    results = regex.findall(''.join(results), re.IGNORECASE | re.DOTALL)
    headers = []
    for item in [i for i in results if i[0:4] != '<div'][:7:1]:
        item = item.replace(' <!-- -->', '')
        headers.append(item)
    return headers

def date_values(results: str):
    """As soon as the data has been decoded into string, through the use regular expressions the date_values function looks for the html snippet, which is set within, to save and organize dates into a list."""
    
    pattern = r'<time .*?>(.*?)</time>'
    regex = re.compile(pattern)
    time = regex.findall(''.join(results), re.IGNORECASE | re.DOTALL)
    date_list = []
    for element in time:
        if element in re.findall(r'\d{2}:\d{2}:\d{2}', element):
            continue
        date_list.append(element)
    return [date for date in date_list if not len(date) < 10]

def open_vol_values(results: str):
    """As soon as the data has been decoded into string, through the use regular expressions the open_vol_values function looks for the html snippet, which is set within, to save and organize from Open to Vol. values from the URL into a list."""

    pattern = r'<td class="text-v2-black text-right text-sm font-normal leading-5 align-middle .*?>(.*?)</td>'
    regex = re.compile(pattern)
    open_values = regex.findall(''.join(results), re.IGNORECASE | re.DOTALL)
    return [open_values[i::4] for i in range(0, 4)]

def change_values(results: str):
    """As soon as the data has been decoded into string, through the use regular expressions the open_vol_values function looks for the html snippet, which is set within, to save and organize Change % values from the URL into a list."""

    pattern = r'<td class="text-v2-black text-right text-sm font-normal leading-5 align-middle .*?>(.*?)</td>'
    regex = re.compile(pattern)
    values = regex.findall(''.join(results), re.IGNORECASE | re.DOTALL)
    price_values = []
    percent_values = []
    for element in values:
        if element.split('.')[1].find('%') == -1:
            price_values.append(element)
        elif element.split('.')[1] == '00%':
            percent_values.append(element)
        else:
            percent_values.append(element)
    pattern = r'datatable_cell__LJp3C datatable_cell--align-end__qgxDQ .*?>(.*?)</td>'
    regex = re.compile(pattern)
    values = regex.findall(''.join(results), re.IGNORECASE | re.DOTALL)
    percent_values = [[element for element in values if element[0:4] != '<div'][i] for i in range(1, len([element for element in values if element[0:4] != '<div']), 2)]
    price_values = [[element for element in values if element[0:4] != '<div'][i] for i in range(0, len([element for element in values if element[0:4] != '<div']), 2)]
    
    return price_values, percent_values

def nvidia_dataframe(url: str):
    """Takes the lists from the previous functions and it generates a new DataFrame."""

    new_list = []
    results = decoding(url)
    price_values, percent_values = change_values(results)
    new_list.append(date_values(results))
    new_list.append(price_values)
    for i in range(0, 4):
        new_list.append(open_vol_values(results)[i])
    new_list.append(percent_values)
    df = pd.DataFrame(new_list, index = None).transpose()
    df.columns = headers(results)
    df[['Price', 'Open', 'High', 'Low']] = df[['Price', 'Open', 'High', 'Low']].astype(float)

    return df

### Execution
- Most recent value from NVIDIA.

In [5]:
urls = ['https://www.investing.com/equities/apple-computer-inc-historical-data', 
        'https://www.investing.com/equities/google-inc-c-historical-data',
        'https://www.investing.com/equities/facebook-inc-historical-data',
        'https://www.investing.com/equities/microsoft-corp-historical-data',
        'https://www.investing.com/equities/amazon-com-inc-historical-data']

df_ = {}
df__ = {}
header = []

for each in urls:
    header.append(each.split("/")[-1].split("-")[0])
    globals()[f'df_{each.split("/")[-1].split("-")[0]}'] = nvidia_dataframe(each)
    df_[each.split("/")[-1].split("-")[0]] = [element for element in globals()[f'df_{each.split("/")[-1].split("-")[0]}']['Price']]
    df_final_price = pd.DataFrame(df_, index = None)
    df_return =  (df_final_price/df_final_price.shift()) - 1
    df_return = df_return.dropna(axis = 0, how = 'all')
    df_standard = df_return.copy()
    df_standard_list = [df_standard[head].mean() for head in header]
    df_standard = df_standard - df_standard_list
    variance = [df_standard[head].var() for head in header]
    standard_deviation = [np.sqrt(variance) for variance in variance]
    df_variance_covariance = pd.DataFrame((np.dot(df_standard.transpose(), df_standard)/(df_standard.shape[0] - 1)), index = header, columns = header)
    expected_return = [df_return[head].mean() for head in header]
    counts = [df_return[head].count() for head in header]
    modified_sharpe = [er / std for er, std in zip(expected_return, standard_deviation)]
    stats = pd.DataFrame([expected_return, counts, standard_deviation, variance, modified_sharpe], columns = header, index = ['expected_return', 'n', 'standard_deviation', 'variance', 'modified_sharpe']) 
    correlation_matrix = df_variance_covariance / np.dot(stats.filter(items = ['standard_deviation'], axis = 0).transpose(), stats.filter(items = ['standard_deviation'], axis = 0))
    #stats = pd.concat([average_return, variance, standard_deviation])
    
# url = 'https://www.investing.com/equities/nvidia-corp-historical-data'
# df = nvidia_dataframe(url)
# df[['Date', 'Price']].query(f'Date == "{str(df["Date"][0])}"')

ValueError: Length mismatch: Expected axis has 7 elements, new values have 0 elements

In [19]:
https = urllib3.PoolManager()
r = https.request('GET', 'https://www.investing.com/equities/apple-computer-inc-historical-data')
results = str(r.data.decode(encoding = 'UTF-8'))
with open('file.txt', 'w') as file:
    file.write(results)

In [18]:
headers

[]

In [4]:
urls = 'https://www.investing.com/equities/apple-computer-inc-historical-data'

nvidia_dataframe(urls)

ValueError: Length mismatch: Expected axis has 7 elements, new values have 0 elements

In [3]:
df_variance_covariance

Unnamed: 0,apple,google,facebook,microsoft,amazon
apple,0.000142,0.000144,9e-06,6.7e-05,6.8e-05
google,0.000144,0.000445,0.000194,0.000203,0.000225
facebook,9e-06,0.000194,0.001508,0.000283,0.000674
microsoft,6.7e-05,0.000203,0.000283,0.000168,0.000207
amazon,6.8e-05,0.000225,0.000674,0.000207,0.000422


In [None]:
stats

In [None]:
correlation_matrix

In [None]:
df_return

In [None]:
df_standard

In [None]:
df_final_price

In [None]:
df_variance_covariance

In [None]:
df_final_price.to_excel('final.xlsx', index = None)

In [None]:
stats

In [None]:
df_variance_covariance/np.dot(np.array(stats.loc['standard_deviation']).T.tolist(),np.array(stats.loc['standard_deviation']).T.tolist())

In [None]:
df_standard

In [None]:
np.array(stats.loc['standard_deviation']).T.tolist()

In [None]:
np.dot(np.array(stats.loc['standard_deviation']).T.tolist(), stats.loc['standard_deviation'].tolist())

### Removing M from Vol.
- The idea is to manipulate the data by removing M from all values.

In [None]:
df.insert(int(len(df.columns) - 2), 'Vol in Million', [float('.'.join(re.findall(r'(\d+)', element))) for element in df['Vol.']])

### Final NVIDIA DataFrame

In [None]:
df

### Descriptive Stats

In [None]:
df.describe()

### Graphs

In [None]:
fig, ax = plt.subplots(int(len(df.describe().columns)), 3, sharex = False, sharey = False, figsize = (20, 20))

df = df.sort_values(by = ['Date'], ascending = True) 

for col, num in zip(df.describe().columns, range(0, int(len(df.describe().columns)))):
    ax[num, 0].plot(df['Date'], df[col])
    ax[num, 0].set_xticks([])
    ax[num, 0].set_xlabel('Date')
    ax[num, 0].set_ylabel(col)
    ax[num, 0].set_title(f'{col} Trend')
    ax[num, 1].hist(df[col], bins = round(1 + math.log2(len(df[col]))), density = True)
    ax[num, 1].plot(kind = 'density')
    ax[num, 1].set_ylabel('Frequency')
    ax[num, 1].set_title(f'{col} Histogram')
    ax[num, 2].boxplot(df[col])
    ax[num, 2].set_ylabel(col)
    ax[num, 2].set_title(f'{col} Boxplot')

fig.tight_layout(pad = 2.0)