# Part 1: Setup and Data Collection

This notebook handles:
- Library imports and configuration
- Data collection from Yahoo Finance
- Saving the raw data for use in the following notebooks


## 1.1 Setup and Configuration


In [1]:
import warnings
# NOTE(review): with no category/module arguments this ignores every warning
# for the rest of the kernel session, so deprecation notices raised by the
# libraries used below will not be shown. Consider narrowing the filter.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from pathlib import Path
from datetime import datetime

# Plot appearance: dark-grid matplotlib style, seaborn 'husl' palette, and
# the default figure size / font size used by every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# pandas display: no column limit, at most 100 rows, floats with 4 decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.4f}'.format



## 1.2 Define Stock Tickers and Date Range


In [2]:
# Ticker universe grouped by sector; the combined list keeps tech first,
# then finance.
tech_stocks = ['AAPL', 'MSFT', 'GOOGL']
finance_stocks = ['JPM', 'BAC', 'GS']
all_tickers = [*tech_stocks, *finance_stocks]

# Download window: fixed start date, end date resolved to today's date
# (YYYY-MM-DD) each time this cell runs.
start_date = '2020-01-01'
end_date = datetime.now().date().isoformat()

# Persist the configuration with IPython's %store magic -- presumably so the
# follow-up notebooks can restore these values with `%store -r` instead of
# redefining them (verify against those notebooks).
%store tech_stocks
%store finance_stocks
%store all_tickers
%store start_date
%store end_date


Stored 'tech_stocks' (list)
Stored 'finance_stocks' (list)
Stored 'all_tickers' (list)
Stored 'start_date' (str)
Stored 'end_date' (str)


## 1.3 Download Stock Data


In [3]:
# Download the close series for each ticker individually and assemble them
# into one wide DataFrame: one column per ticker, indexed by date.
# auto_adjust=True is passed to yfinance, so 'Close' is the adjusted close.
prices_list = []

for ticker in all_tickers:
    try:
        stock = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)
        if stock.empty:
            # Report empty downloads instead of silently dropping the ticker.
            print(f"✗ No data returned for {ticker}")
            continue

        if isinstance(stock.columns, pd.MultiIndex):
            # Two-level columns: take the 'Close' group and prefer the column
            # labelled with this ticker, else the first column of that group.
            close_frame = stock['Close']
            close_price = close_frame[ticker] if ticker in close_frame.columns else close_frame.iloc[:, 0]
        else:
            # NOTE(review): when no 'Close' column exists this falls back to
            # the first column, which may not be a closing price -- confirm
            # this fallback is still wanted.
            close_price = stock['Close'] if 'Close' in stock.columns else stock.iloc[:, 0]

        # Label the series with the ticker so it becomes the column name.
        prices_list.append(close_price.rename(ticker))
    except Exception as e:
        # Best-effort: report which ticker failed and continue with the rest.
        print(f"✗ Error downloading {ticker}: {e}")

if not prices_list:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(
        "No price data was downloaded for any ticker; "
        "check the ticker symbols, date range, and network connection."
    )

prices_df = pd.concat(prices_list, axis=1)

## 1.4 Save Raw Data


In [4]:
# Write the raw price table to <parent of the working directory>/data/raw,
# creating the directory tree first if it does not exist yet.
output_path = Path.cwd().parent / 'data' / 'raw' / 'stock_prices.csv'
data_dir = output_path.parent
data_dir.mkdir(parents=True, exist_ok=True)
prices_df.to_csv(output_path)

# Also persist the DataFrame via IPython's %store magic, in addition to the
# CSV written above -- presumably for `%store -r` in later notebooks.
%store prices_df


Stored 'prices_df' (DataFrame)


## 1.5 Preview Data


In [5]:
# Quick sanity check of the assembled price table: head, tail, and schema.
print("First 5 rows:")
display(prices_df.head())

print("\nLast 5 rows:")
display(prices_df.tail())

print("\nData Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
prices_df.info()


First 5 rows:


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,JPM,BAC,GS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,72.4683,152.5058,67.9652,119.5734,30.9203,204.1899
2020-01-03,71.7637,150.6068,67.6097,117.9955,30.2783,201.8022
2020-01-06,72.3355,150.996,69.4118,117.9016,30.2349,203.8674
2020-01-07,71.9954,149.6193,69.2777,115.8972,30.0354,205.2095
2020-01-08,73.1535,152.0025,69.7708,116.8013,30.339,207.1876



Last 5 rows:


Unnamed: 0_level_0,AAPL,MSFT,GOOGL,JPM,BAC,GS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-11-21,271.49,472.12,299.66,298.02,51.56,774.03
2025-11-24,275.92,474.0,318.58,298.0,51.93,790.71
2025-11-25,276.97,476.99,323.44,303.0,52.48,802.32
2025-11-26,277.55,485.5,319.95,307.64,52.99,816.01
2025-11-28,278.85,492.01,320.18,313.08,53.65,826.04



Data Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1486 entries, 2020-01-02 to 2025-11-28
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1486 non-null   float64
 1   MSFT    1486 non-null   float64
 2   GOOGL   1486 non-null   float64
 3   JPM     1486 non-null   float64
 4   BAC     1486 non-null   float64
 5   GS      1486 non-null   float64
dtypes: float64(6)
memory usage: 81.3 KB
None
