In [45]:

import pandas as pd
import numpy as np
from scipy import stats

In [46]:
# DataFrame creation
#
# Builds `df`, a synthetic long-format table with one row per (stock, business
# day) for 2023. All values are random; nothing here is real market data.

# Setting a seed for reproducibility. This is the legacy global NumPy RNG, so
# the ORDER of the np.random calls below determines every generated value.
np.random.seed(0)

# Generate a date range
dates = pd.date_range(start="2023-01-01", end="2023-12-31", freq='B')  # Business days only
n_days = len(dates)

# Simulate data for multiple stocks
stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'FB']
data = []  # one [Date, Stock, ClosingPrice, Volume, OpeningPrice, High, Low, MarketCap] list per row

for stock in stocks:
    # Randomly generate data. Keep these six draws in this exact order so the
    # seeded output stays the same from run to run.
    closing_prices = np.random.uniform(low=100, high=500, size=n_days)
    volumes = np.random.randint(low=1000000, high=10000000, size=n_days)
    opening_prices = closing_prices * np.random.uniform(0.95, 1.05, size=n_days)
    high = closing_prices * np.random.uniform(1.01, 1.1, size=n_days)
    low = closing_prices * np.random.uniform(0.9, 0.99, size=n_days)
    market_cap = closing_prices * np.random.uniform(10, 1000, size=n_days)

    # The open is drawn up to 5% away from the close, but the raw high/low can
    # sit as little as 1% away, so the open could land outside [low, high].
    # Widen the range to include the open so High/Low really are the session
    # extremes. No extra random draws are made, so all other values are
    # unaffected.
    high = np.maximum(high, opening_prices)
    low = np.minimum(low, opening_prices)

    # Emit one record per business day for this stock.
    data.extend(
        [day, stock, close, volume, open_, day_high, day_low, cap]
        for day, close, volume, open_, day_high, day_low, cap in zip(
            dates, closing_prices, volumes, opening_prices, high, low, market_cap
        )
    )

df = pd.DataFrame(data, columns=['Date', 'Stock', 'ClosingPrice', 'Volume', 'OpeningPrice', 'High', 'Low', 'MarketCap'])

# Print the DataFrame (pandas truncates long frames to the first and last rows)
print(df)

# Date: The date of the trading session.
# Stock: The stock symbol.
# ClosingPrice: The closing price of the stock on that day.
# Volume: The number of shares traded during the day.
# OpeningPrice: The opening price of the stock on that day.
# High: The highest price of the stock during the trading session.
# Low: The lowest price of the stock during the trading session.
# MarketCap: The market capitalization of the stock. It is simulated as
#   ClosingPrice times a random factor in [10, 1000); the original note said
#   "in billions", but the simulation itself does not establish a unit.


           Date Stock  ClosingPrice   Volume  OpeningPrice        High  \
0    2023-01-02  AAPL    319.525402  7365385    314.205542  348.046568   
1    2023-01-03  AAPL    386.075747  3311949    402.451488  422.773177   
2    2023-01-04  AAPL    341.105350  4455865    353.464230  374.966923   
3    2023-01-05  AAPL    317.953273  5130630    303.603733  331.913557   
4    2023-01-06  AAPL    269.461920  7131367    262.823524  295.587134   
...         ...   ...           ...      ...           ...         ...   
1295 2023-12-25    FB    112.681285  9354716    107.340532  119.644133   
1296 2023-12-26    FB    404.637659  1781596    411.787669  414.428937   
1297 2023-12-27    FB    422.601371  6113914    415.180216  429.553350   
1298 2023-12-28    FB    468.192240  6303145    485.953772  498.678280   
1299 2023-12-29    FB    135.166227  7155363    130.346306  137.467395   

             Low      MarketCap  
0     309.953972   95619.763051  
1     376.945470  379574.311612  
2     322

In [52]:
# Estimates of Variability

# "Variability" and "dispersion" are two names for the same idea: how spread
# out the values of a variable are.

# Keep only the AAPL rows, and only the columns the statistics below need.
is_aapl = df['Stock'].eq('AAPL')
wanted_columns = ['Date', 'Stock', 'Volume']
aapl_volume = df.loc[is_aapl, wanted_columns]
aapl_volume

Unnamed: 0,Date,Stock,Volume
0,2023-01-02,AAPL,7365385
1,2023-01-03,AAPL,3311949
2,2023-01-04,AAPL,4455865
3,2023-01-05,AAPL,5130630
4,2023-01-06,AAPL,7131367
...,...,...,...
255,2023-12-25,AAPL,9337019
256,2023-12-26,AAPL,2425692
257,2023-12-27,AAPL,7583960
258,2023-12-28,AAPL,3555587


In [None]:
# Summary Statistics

# Every statistic below is computed on the same column, so pull it out once.
aapl_vol = aapl_volume['Volume']

# Mean: the arithmetic average of the daily volumes.
aapl_mean = np.mean(aapl_vol)
print('AAPL mean:', aapl_mean)

# describe() reports count, mean, min, quartiles, max and std for the frame.
# The standard deviation (std) is the square root of the variance, i.e. a
# "typical" distance of a data point from the mean, in the data's own units.
print('Statistics Sumary: \n',aapl_volume.describe())

# Variance: the averaged squared difference from the mean; it quantifies how
# spread out the values are. pandas' .var() is the sample variance (ddof=1), so
# the sum of squares is divided by n - 1 rather than n.
aapl_variance = aapl_vol.var()
print('variance:', aapl_variance)

# Mean absolute deviation (MAD): how far, on average, each data point lies
# from the mean.
distance_from_mean = (aapl_vol - aapl_mean).abs()
aapl_mad = np.mean(distance_from_mean)
print('mad:', aapl_mad)

# Median absolute deviation: centred on the median instead of the mean; it is
# the typical (median) distance of a data point from the median.
median = np.median(aapl_vol)
distance_from_median = (aapl_vol - median).abs()
median_absolute_deviation = np.median(distance_from_median)
print('median_absolute_deviation:', median_absolute_deviation)

# How the two mean-based measures differ:
#
# - Standard Deviation (std): emphasises extremes. The differences from the
# mean are squared before they are averaged, so large deviations count
# disproportionately more than small ones. This makes the standard deviation
# particularly sensitive to outliers or extreme values.
#
# - Mean Absolute Deviation (MAD): emphasises the average distance. It averages
# the absolute differences from the mean, so each deviation contributes in
# proportion to its size and nothing is squared. The result is a
# straightforward measure of variability that does not unduly emphasise
# outliers.

AAPL mean: 5442656.576923077
Statistics Sumary: 
                       Date        Volume
count                  260  2.600000e+02
mean   2023-07-01 12:00:00  5.442657e+06
min    2023-01-02 00:00:00  1.001337e+06
25%    2023-04-02 06:00:00  3.172438e+06
50%    2023-07-01 12:00:00  5.544756e+06
75%    2023-09-29 18:00:00  7.706712e+06
max    2023-12-29 00:00:00  9.982554e+06
std                    NaN  2.631123e+06
variance: 6922806324050.209
mad: 2272527.8855029587
median_absolute_deviation: 2259824.5
