In [3]:
# Install required packages
#!pip install yfinance pandas numpy torch transformers pickle-mixin pyarrow
!pip install faiss-cpu
!pip install yfinance 

import yfinance as yf
import pandas as pd
import numpy as np
import torch
import json
import pickle
import re
from datetime import datetime
from itertools import groupby
from operator import itemgetter
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

import warnings
warnings.filterwarnings('ignore')

[0mCollecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m213.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
[0mCollecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m949.2/949.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements

In [4]:
import yfinance as yf
import pandas as pd
import time

# Ticker symbols to process: 25 in total, grouped by sector
# (the number in parentheses is the size of each group).
companies = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA", "NVDA",     # Tech (7) - Clear patterns
    "JPM", "V", "MA", "BAC",                                      # Finance (4) - Interest rate sensitive
    "WMT", "HD", "MCD", "NKE",                                   # Consumer (4) - Economic indicators
    "JNJ", "PFE", "UNH",                                         # Healthcare (3) - Defensive/growth
    "XOM", "CVX",                                                # Energy (2) - Commodity driven
    "INTC", "AMD", "ADBE", "CRM", "NFLX"                        # Tech Growth (5) - High volatility patterns
]

def process_company_data(ticker, start_date, end_date):
    """Download price history for one ticker and save it as ``<ticker>.csv``.

    The frame returned by ``yf.download`` is normalised to the columns
    ``date, ticker, open, high, low, close, volume, adj_close`` and written
    (without the index) to the current working directory.

    Args:
        ticker: Symbol passed to ``yf.download``; also used for the CSV file
            name and for the ``ticker`` column.
        start_date: Passed through to ``yf.download`` as ``start``.
        end_date: Passed through to ``yf.download`` as ``end``.

    Returns:
        The normalised DataFrame, or ``None`` when no rows were downloaded or
        any step failed. Failures are printed, never raised, so one bad
        ticker does not abort a batch run.
    """
    try:
        print(f"Processing {ticker}...")

        # Download data
        downloaded = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

        # An empty result would otherwise only surface as a confusing KeyError
        # during column selection; report it explicitly instead.
        if downloaded is None or downloaded.empty:
            print(f"Error processing {ticker}: no data returned")
            return None

        prices = downloaded.reset_index()

        # Column labels are either (field, ticker) tuples (MultiIndex) or plain
        # field-name strings. Keep only the field name in both cases; indexing
        # a plain string with [0] would reduce 'Open' to 'O'.
        prices.columns = [
            label[0] if isinstance(label, tuple) else label
            for label in prices.columns
        ]

        # Source field -> output column name
        column_names = {
            'Date': 'date',
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume': 'volume',
            'Adj Close': 'adj_close',
        }

        # Select, rename and tag with the ticker; each step returns a new frame,
        # so nothing is assigned into a slice of the downloaded data.
        prices = (
            prices[list(column_names)]
            .rename(columns=column_names)
            .assign(ticker=ticker)
        )

        # Reorder columns
        prices = prices[['date', 'ticker', 'open', 'high', 'low', 'close', 'volume', 'adj_close']]

        # Save individual company file
        prices.to_csv(f"{ticker}.csv", index=False)
        print(f"Saved {ticker}.csv")

        return prices

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip this ticker.
        print(f"Error processing {ticker}: {str(e)}")
        return None

def process_all_companies(companies, start_date, end_date):
    """Run ``process_company_data`` for every ticker and merge the results.

    Tickers for which ``process_company_data`` returns ``None`` are skipped.
    The remaining frames are concatenated with a fresh integer index and
    written to ``all_companies.csv`` in the current working directory.

    Args:
        companies: Iterable of ticker symbols.
        start_date: Forwarded unchanged to ``process_company_data``.
        end_date: Forwarded unchanged to ``process_company_data``.

    Returns:
        The combined DataFrame, or ``None`` if no ticker produced data.
    """
    frames = []

    for symbol in companies:
        frame = process_company_data(symbol, start_date, end_date)
        if frame is None:
            continue
        frames.append(frame)
        # Brief pause after each successful download to avoid rate limiting
        time.sleep(0.2)

    # Nothing to combine: report and bail out early
    if not frames:
        print("No data was processed successfully")
        return None

    # Stack the per-ticker frames and persist the result
    combined_df = pd.concat(frames, ignore_index=True)
    combined_df.to_csv("all_companies.csv", index=False)
    print("Saved combined data to all_companies.csv")
    return combined_df

# Main execution
if __name__ == "__main__":
    # Date window handed to the downloader for every ticker.
    # NOTE(review): yfinance documents `end` as exclusive, so 2024-12-31 itself
    # is presumably not part of the download - confirm that this is intended.
    start_date, end_date = "2022-01-01", "2024-12-31"

    # Download, save and combine every ticker listed in `companies`
    final_df = process_all_companies(companies, start_date, end_date)

    # process_all_companies returns None when no ticker produced data
    if final_df is not None:
        print(
            "\nSample of combined data:",
            final_df.head(),
            "\nData processing complete!",
            sep="\n",
        )

Processing AAPL...


[*********************100%***********************]  1 of 1 completed


Saved AAPL.csv
Processing MSFT...


[*********************100%***********************]  1 of 1 completed


Saved MSFT.csv
Processing GOOGL...


[*********************100%***********************]  1 of 1 completed


Saved GOOGL.csv
Processing AMZN...


[*********************100%***********************]  1 of 1 completed


Saved AMZN.csv
Processing META...


[*********************100%***********************]  1 of 1 completed


Saved META.csv
Processing TSLA...


[*********************100%***********************]  1 of 1 completed


Saved TSLA.csv
Processing NVDA...


[*********************100%***********************]  1 of 1 completed


Saved NVDA.csv
Processing JPM...


[*********************100%***********************]  1 of 1 completed


Saved JPM.csv
Processing V...


[*********************100%***********************]  1 of 1 completed


Saved V.csv
Processing MA...


[*********************100%***********************]  1 of 1 completed


Saved MA.csv
Processing BAC...


[*********************100%***********************]  1 of 1 completed


Saved BAC.csv
Processing WMT...


[*********************100%***********************]  1 of 1 completed


Saved WMT.csv
Processing HD...


[*********************100%***********************]  1 of 1 completed


Saved HD.csv


[*********************100%***********************]  1 of 1 completed

Processing MCD...
Saved MCD.csv



[*********************100%***********************]  1 of 1 completed

Processing NKE...
Saved NKE.csv





Processing JNJ...


[*********************100%***********************]  1 of 1 completed


Saved JNJ.csv
Processing PFE...


[*********************100%***********************]  1 of 1 completed


Saved PFE.csv
Processing UNH...


[*********************100%***********************]  1 of 1 completed


Saved UNH.csv
Processing XOM...


[*********************100%***********************]  1 of 1 completed


Saved XOM.csv
Processing CVX...


[*********************100%***********************]  1 of 1 completed


Saved CVX.csv
Processing INTC...


[*********************100%***********************]  1 of 1 completed


Saved INTC.csv
Processing AMD...


[*********************100%***********************]  1 of 1 completed


Saved AMD.csv
Processing ADBE...


[*********************100%***********************]  1 of 1 completed


Saved ADBE.csv
Processing CRM...


[*********************100%***********************]  1 of 1 completed


Saved CRM.csv


[*********************100%***********************]  1 of 1 completed

Processing NFLX...
Saved NFLX.csv





Saved combined data to all_companies.csv

Sample of combined data:
        date ticker        open        high         low       close  \
0 2022-01-03   AAPL  177.830002  182.880005  177.710007  182.009995   
1 2022-01-04   AAPL  182.630005  182.940002  179.119995  179.699997   
2 2022-01-05   AAPL  179.610001  180.169998  174.639999  174.919998   
3 2022-01-06   AAPL  172.699997  175.300003  171.639999  172.000000   
4 2022-01-07   AAPL  172.889999  174.139999  171.029999  172.169998   

      volume   adj_close  
0  104487900  178.443115  
1   99310400  176.178406  
2   94537600  171.492081  
3   96904000  168.629288  
4   86709100  168.795975  

Data processing complete!
