In [16]:
#!pip install yfinance

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from yfinance import download
from datetime import datetime
from pandas.plotting import lag_plot
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.api import OLS, add_constant
import datetime as dt

In [17]:
# Define the date range for fetching data
start_date = '2024-01-01'
end_date = '2024-12-31'

# Fetch data using yfinance
def fetch_stock_data(ticker):
    return download(ticker, start=start_date, end=end_date)

# Get FAANG data
fb = fetch_stock_data('META')
aapl = fetch_stock_data('AAPL')
amzn = fetch_stock_data('AMZN')
nflx = fetch_stock_data('NFLX')
goog = fetch_stock_data('GOOG')

# Get S&P 500 data
sp = fetch_stock_data('^GSPC')

# Get bitcoin data in USD
bitcoin = fetch_stock_data('BTC-USD')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [18]:
# Group FAANG stocks
def group_stocks(stock_dict):
    return pd.concat({key: stock["Close"] for key, stock in stock_dict.items()}, axis=1)

# Data groups
faang = group_stocks(
    {
        'Facebook': fb, 
        'Apple': aapl, 
        'Amazon': amzn, 
        'Netflix': nflx, 
        'Google': goog
    }
)

faang_sp = group_stocks(
    {
        'Facebook': fb, 
        'Apple': aapl, 
        'Amazon': amzn, 
        'Netflix': nflx, 
        'Google': goog,
        'S&P 500': sp
    }
)

all_assets = group_stocks(
    {
        'Bitcoin': bitcoin,
        'S&P 500': sp,
        'Facebook': fb, 
        'Apple': aapl, 
        'Amazon': amzn, 
        'Netflix': nflx, 
        'Google': goog
    }
)

In [19]:
# Function to clean and process datasets
def clean_dataset(df, key):
    """Clean the dataset by removing unnecessary rows and renaming columns."""
    df = df.iloc[2:].reset_index(drop=True)
    df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
    df['Ticker'] = key  # Add a column for the ticker symbol
    df['Date'] = pd.to_datetime(df['Date'])  # Convert Date to datetime
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')  # Ensure numeric columns
    return df

# Define file paths for all datasets
file_paths = {
    "Amazon": r"C:\Users\alanm\OneDrive\Documents\MADS\Capstone1\GitHub_Project\AMZN_data.csv",
    "Apple": "C:\\Users\\alanm\\OneDrive\\Documents\\MADS\\Capstone1\\GitHub_Project\\AAPL_data.csv",
    "Bitcoin": "C:\\Users\\alanm\\OneDrive\\Documents\\MADS\\Capstone1\\GitHub_Project\\BTC-USD_data",
    "Facebook": "C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07\\META_data.csv",
    "Google": "C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07\\GOOG_data.csv",
    "Netflix": "C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07\\NFLX_data.csv",
    "S&P 500": "C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07\\^GSPC_data.csv"
}

# Clean all datasets
datasets = {key: clean_dataset(pd.read_csv(path), key) for key, path in file_paths.items()}

# Define tickers to analyze
tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOG']

# Plot 1: Moving Average Plots
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
for ax, ticker in zip(axes.flatten(), tickers):
    data = datasets[ticker]
    data['20D_MA'] = data['Close'].rolling(window=20).mean()  # Calculate 20-day moving average
    ax.plot(data['Date'], data['Close'], label=f'{ticker} Close')
    ax.plot(data['Date'], data['20D_MA'], linestyle='--', label='20D MA')
    ax.set_title(f'{ticker} Close and 20D MA')
    ax.legend()
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
plt.tight_layout()
plt.show()

# Plot 2: Histograms of Close Prices
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
for ax, ticker in zip(axes.flatten(), tickers):
    data = datasets[ticker]
    sns.histplot(data['Close'], kde=True, ax=ax, bins=20)
    ax.set_title(f'{ticker} Close Price Distribution')
    ax.set_xlabel('Price')
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Plot 3: Boxplot of Close Prices
plt.figure(figsize=(10, 6))
boxplot_data = [datasets[ticker]['Close'].dropna() for ticker in tickers]
plt.boxplot(boxplot_data, labels=tickers)
plt.title('Boxplot of Close Prices')
plt.xlabel('Tickers')
plt.ylabel('Price')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\alanm\\OneDrive\\Documents\\MADS\\Capstone1\\GitHub_Project\\BTC-USD_data'

In [None]:
import pandas as pd
import os

# Define the dataset directory
dataset_dir = r"C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07"

# Define file names for all datasets
file_names = {
    "Amazon": "AMZN_data.csv",
    "Apple": "AAPL_data.csv",
    "Bitcoin": "BTC-USD_data.csv",
    "Facebook": "META_data.csv",
    "Google": "GOOG_data.csv",
    "Netflix": "NFLX_data.csv",
    "S&P 500": "^GSPC_data.csv"
}

# Define a function to process each dataset and compute summary statistics
def process_data(file_path):
    # Read data
    df = pd.read_csv(file_path)
    # Ensure the necessary columns exist and clean data
    df = df.rename(columns=lambda x: x.strip())
    if 'Close' in df.columns:
        df['Close'] = pd.to_numeric(df['Close'], errors='coerce')  # Ensure 'Close' is numeric
        return df['Close'].describe()  # Compute summary statistics for the 'Close' column

# Read and process all datasets into a group_stocks structure
all_assets = {
    name: process_data(os.path.join(dataset_dir, file_name))
    for name, file_name in file_names.items()
}

# Convert to DataFrame and structure rows and columns as specified
summary_df = pd.DataFrame(all_assets).T
summary_df.rename(
    columns={
        "count": "Count",
        "mean": "Mean",
        "std": "Std",
        "min": "Min",
        "25%": "25%",
        "50%": "50%",
        "75%": "75%",
        "max": "Max"
    },
    inplace=True
)

# Ensure the rows are named properly
summary_df.index.name = "Stock"

# Display the final summary DataFrame
from IPython.display import display
display(summary_df)

In [None]:
# Fetch data using yfinance
def fetch_stock_data(ticker):
    data = download(ticker, start=start_date, end=end_date)
    data.columns = [f"{ticker}_{col}" for col in data.columns]  # Prefix columns with ticker symbol
    return data

# Fetch data again to ensure column names are updated
fb = fetch_stock_data('META')
aapl = fetch_stock_data('AAPL')
amzn = fetch_stock_data('AMZN')
nflx = fetch_stock_data('NFLX')
goog = fetch_stock_data('GOOG')
sp = fetch_stock_data('^GSPC')
bitcoin = fetch_stock_data('BTC-USD')

# Debug: Check the updated column names
print("Flattened Columns in AAPL DataFrame:")
print(aapl.columns)

# Check for the closing price column
closing_column = None
for col in aapl.columns:
    if 'Close' in col:
        closing_column = col
        break

if closing_column is None:
    raise KeyError("Column for closing prices not found. Available columns: " + ', '.join(aapl.columns))

# Example plot: Apple stock closing prices
plt.figure(figsize=(10, 6))
sns.lineplot(x=aapl.index, y=aapl[closing_column], color="blue")
plt.title("Apple Stock Closing Prices")
plt.xlabel("Date")
plt.ylabel("Closing Price")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Define the dataset directory
dataset_dir = r"C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07"

# Define file names for all datasets
file_names = {
    "Amazon": "AMZN_data.csv",
    "Apple": "AAPL_data.csv",
    "Bitcoin": "BTC-USD_data.csv",
    "Facebook": "META_data.csv",
    "Google": "GOOG_data.csv",
    "Netflix": "NFLX_data.csv",
    "S&P 500": "^GSPC_data.csv"
}

# Load datasets and extract 'Close' prices
data = {}
for name, file_name in file_names.items():
    file_path = os.path.join(dataset_dir, file_name)
    df = pd.read_csv(file_path)
    df = df.rename(columns=lambda x: x.strip())
    if 'Close' in df.columns:
        df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
        data[name] = df['Close']

# Create a DataFrame with all assets' 'Close' prices
close_prices = pd.DataFrame(data)

# Compute the correlation matrix
correlation_matrix = close_prices.corr()

# Plot the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Stock Prices")
plt.show()

In [None]:
# Define a function to clean the datasets
def clean_dataset(filepath, column_names):
    """
    Cleans a dataset by:
    - Removing the first two rows (assumed to be metadata).
    - Renaming columns to a standardized format.
    - Converting numeric columns to proper data types.
    """
    # Load the dataset
    df = pd.read_csv(filepath)

    # Remove metadata rows and reset index
    df = df[2:].reset_index(drop=True)

    # Assign proper column names
    df.columns = column_names

    # Convert 'Date' column to datetime
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])

    # Convert numeric columns to appropriate types
    numeric_columns = [col for col in column_names if col != 'Date']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    return df

# Define the directory for datasets and filepaths
dataset_dir = r"C:\\Users\\alanm\\OneDrive\\Documents\\Hands-On-Data-Analysis-with-Pandas-2nd-edition\\Hands-On-Data-Analysis-with-Pandas-2nd-edition-1\\ch_07"

# Filepaths and column names for each dataset
datasets_info = {
    "^GSPC": (f"{dataset_dir}\\^GSPC_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "AAPL": (f"{dataset_dir}\\AAPL_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "AMZN": (f"{dataset_dir}\\AMZN_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "BTC-USD": (f"{dataset_dir}\\BTC-USD_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "GOOG": (f"{dataset_dir}\\GOOG_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "META": (f"{dataset_dir}\\META_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
    "NFLX": (f"{dataset_dir}\\NFLX_data.csv", ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']),
}

# Clean all datasets
cleaned_data = {name: clean_dataset(filepath, columns) for name, (filepath, columns) in datasets_info.items()}

# Visualization Code
import matplotlib.pyplot as plt
import seaborn as sns

# Pairplot for S&P 500 data
sns.pairplot(cleaned_data['^GSPC'][['High', 'Low', 'Close']], diag_kind="kde", kind="scatter", corner=True)
plt.suptitle("S&P 500 Pairplot", y=1.02)
plt.show()

# Time series plot for Bitcoin data
plt.figure(figsize=(10, 6))
plt.plot(cleaned_data['BTC-USD']['Date'], cleaned_data['BTC-USD']['Close'], label='Bitcoin Close Price')
plt.title('Bitcoin Close Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid()
plt.show()

# Correlation heatmap for Amazon data
plt.figure(figsize=(8, 6))
sns.heatmap(cleaned_data['AMZN'][['Close', 'High', 'Low', 'Open', 'Volume']].corr(), annot=True, cmap='coolwarm')
plt.title('Amazon Data Correlation Heatmap')
plt.show()

# Line plot for FAANG stock prices (Close)
plt.figure(figsize=(10, 6))
for name in ['AAPL', 'AMZN', 'GOOG', 'META', 'NFLX']:
    plt.plot(cleaned_data[name]['Date'], cleaned_data[name]['Close'], label=f'{name} Close Price')
plt.title('FAANG Close Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Data inspection (example for Apple stock)
print(aapl.head())

In [None]:
# Basic statistics
print(aapl.describe())

In [None]:
# Plot settings
sns.set_theme(style="whitegrid")

In [None]:
# Fetch data using yfinance
def fetch_stock_data(ticker):
    data = download(ticker, start=start_date, end=end_date)
    data.columns = [f"{ticker}_{col}" for col in data.columns]  # Prefix columns with ticker
    return data

# Re-fetch data with updated column naming
bitcoin = fetch_stock_data('BTC-USD')

# Debug: Check column names in bitcoin DataFrame
print("Flattened Columns in Bitcoin DataFrame:")
print(bitcoin.columns)

# Adjust for closing price column
closing_column = None
for col in bitcoin.columns:
    if 'close' in col.lower():  # Match 'Close' case-insensitively
        closing_column = col
        break

if closing_column is None:
    raise KeyError("Column for closing prices not found. Available columns: " + ', '.join(bitcoin.columns))

# Example plot: Bitcoin prices
plt.figure(figsize=(10, 6))
sns.lineplot(x=bitcoin.index, y=bitcoin[closing_column], color="orange")
plt.title("Bitcoin Prices in USD")
plt.xlabel("Date")
plt.ylabel("Price")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Example correlation heatmap for FAANG stocks
faang.columns = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOG']  # Ensure correct column names
correlation_matrix = faang.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap for FAANG Stocks")
plt.show()

In [None]:
# Example pairplot for FAANG stocks
faang_viz = faang.reset_index(drop=True)  # Reset index for visualization
sns.pairplot(faang_viz, diag_kind="kde", kind="scatter", corner=True)
plt.suptitle("FAANG Stocks Pairplot", y=1.02)
plt.show()

In [None]:
# Visualizing cumulative returns
from cycler import cycler

# Black-and-white visualization cycle
bw_viz_cycler = (
    cycler(color=[plt.get_cmap('tab10')(x / 10) for x in range(10)])
    + cycler(linestyle=['dashed', 'solid', 'dashdot', 'dotted', 'solid'] * 2)
)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes[0].set_prop_cycle(bw_viz_cycler)

# Function to calculate cumulative returns
def calculate_cumulative_returns(data):
    returns = data.pct_change().fillna(0)  # Calculate daily returns
    cumulative_returns = (1 + returns).cumprod() - 1  # Calculate cumulative returns
    return cumulative_returns

# Calculate cumulative returns for all assets
cumulative_returns = calculate_cumulative_returns(all_assets)

for name in cumulative_returns.columns:
    if name == 'Bitcoin':
        cumulative_returns[name].plot(ax=axes[1], label=name, legend=True)
    else:
        cumulative_returns[name].plot(ax=axes[0], label=name, legend=True)

fig.suptitle('Cumulative Returns')
axes[0].set_title('Non-Bitcoin Assets')
axes[1].set_title('Bitcoin')
plt.show()


In [None]:
import statsmodels.api as sm

class StockModeler:
	@staticmethod
	def decompose(data, period):
		return sm.tsa.seasonal_decompose(data['Close'], period=period)

# Perform decomposition
decomposition = StockModeler.decompose(nflx, 20) # 20 period frequency
fig = decomposition.plot()
fig.suptitle('Netflix Stock Price Time Series Decomposition', y=1)
fig.set_figheight(6)
fig.set_figwidth(10)
fig.tight_layout()

In [None]:
# Use the 'Close' column for the autocorrelation plot
from pandas.plotting import autocorrelation_plot

if 'Close' in nflx.columns:
    autocorrelation_plot(nflx['Close'])
    plt.title("Autocorrelation Plot for Netflix Closing Prices")
    plt.show()
else:
    print("The 'Close' column is not available in the Netflix DataFrame.")


In [None]:
# Lag plot for Netflix stock
data_to_plot = nflx['Close'] if 'Close' in nflx.columns else None
if data_to_plot is not None:
    lag_plot(data_to_plot)
    plt.title("Lag Plot for Netflix Closing Prices")
    plt.show()
else:
    print("The 'Close' column is not available for Netflix data.")

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Prepare the Netflix data for ARIMA modeling
if 'Close' in nflx.columns:
    nflx_close = nflx['Close'].dropna()  # Ensure no missing values
else:
    raise KeyError("The 'Close' column is not available in the Netflix data.")

# Define ARIMA parameters
ar_order = 10  # Autoregressive term
i_order = 1    # Differencing term
ma_order = 5   # Moving average term

# Fit the ARIMA model
arima_model = ARIMA(nflx_close, order=(ar_order, i_order, ma_order))
arima_result = arima_model.fit()

# Display the ARIMA model summary
print(arima_result.summary())

# Plot the fitted values
plt.figure(figsize=(10, 6))
plt.plot(nflx_close, label="Original Data")
plt.plot(arima_result.fittedvalues, label="Fitted Values", color="red")
plt.title("ARIMA Model Fitted Values")
plt.legend()
plt.show()


In [1]:
# Plot ARIMA residuals
if 'arima_model' in locals():
    residuals = arima_model.resid

    # Residual time series plot
    plt.figure(figsize=(10, 6))
    plt.plot(residuals, color='blue')
    plt.axhline(y=0, color='red', linestyle='--', linewidth=1)
    plt.title("ARIMA Model Residuals")
    plt.xlabel("Time")
    plt.ylabel("Residuals")
    plt.show()

    # Residuals distribution plot
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=30, color='blue')
    plt.title("Residuals Distribution")
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.show()
else:
    print("ARIMA model is not available.")


ARIMA model is not available.
