In [None]:
import time as t
from twelvedata import TDClient
import pandas as pd
from datetime import datetime, timedelta, time
import os
from google.colab import drive

# Initialize the client
td = TDClient(apikey=api_key)

# Mount Google Drive
drive.mount('/content/drive')

def fetch_stock_data(symbol):
    # Define the date range
    start_date = datetime(2021, 1, 1)
    end_date = datetime(2024, 12, 31)

    # Define market hours
    market_open_time = time(9, 30)  # 9:30 AM
    market_close_time = time(16, 0)  # 4:00 PM

    all_data = []

    # Iterate through each month in the date range
    current_date = start_date
    while current_date <= end_date:
        # Calculate the last day of the current month
        if current_date.month == 12:
            next_month = current_date.replace(year=current_date.year + 1, month=1, day=1)
        else:
            next_month = current_date.replace(month=current_date.month + 1, day=1)
        last_day_of_month = next_month - timedelta(days=1)

        # Define the start and end times for the entire month
        month_start = datetime.combine(current_date, market_open_time)
        month_end = datetime.combine(last_day_of_month, market_close_time)

        # Fetch data for the entire month
        try:
            print(f"Fetching {symbol} data for {current_date.strftime('%Y-%m')}...")
            data = td.time_series(
                symbol=symbol,
                interval="15min",
                start_date=month_start.strftime("%Y-%m-%d %H:%M:%S"),
                end_date=month_end.strftime("%Y-%m-%d %H:%M:%S"),
                outputsize=5000
            ).as_pandas()

            if not data.empty:
                all_data.append(data)
                print(f"Successfully fetched {len(data)} rows for {symbol} in {current_date.strftime('%Y-%m')}.")
            else:
                print(f"No data returned for {symbol} in {current_date.strftime('%Y-%m')} (market may have been closed).")

        except Exception as e:
            print(f"Error fetching {symbol} data for {current_date.strftime('%Y-%m')}: {e}")

        # Move to the next month
        current_date = next_month

        # Add delay between requests to avoid hitting API limits
        t.sleep(10)  # 10-second delay (6 requests per minute)

    # Combine all chunks into a single DataFrame
    if all_data:
        full_data = pd.concat(all_data)
        # Remove duplicates (if any)
        full_data = full_data[~full_data.index.duplicated(keep='first')]
        # Save to Google Drive
        full_data.to_csv(f'/content/drive/My Drive/{symbol}_data.csv', index=True)
        print(f"{symbol} data saved to Google Drive as {symbol}_data.csv")
    else:
        print(f"No data collected for {symbol}")

# Pull data for AAPL and MSFT
fetch_stock_data("AAPL")
fetch_stock_data("MSFT")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fetching AAPL data for 2021-01...
Successfully fetched 494 rows for AAPL in 2021-01.
Fetching AAPL data for 2021-02...
Successfully fetched 494 rows for AAPL in 2021-02.
Fetching AAPL data for 2021-03...
Successfully fetched 598 rows for AAPL in 2021-03.
Fetching AAPL data for 2021-04...
Successfully fetched 546 rows for AAPL in 2021-04.
Fetching AAPL data for 2021-05...
Successfully fetched 520 rows for AAPL in 2021-05.
Fetching AAPL data for 2021-06...
Successfully fetched 572 rows for AAPL in 2021-06.
Fetching AAPL data for 2021-07...
Successfully fetched 546 rows for AAPL in 2021-07.
Fetching AAPL data for 2021-08...
Successfully fetched 572 rows for AAPL in 2021-08.
Fetching AAPL data for 2021-09...
Successfully fetched 546 rows for AAPL in 2021-09.
Fetching AAPL data for 2021-10...
Successfully fetched 546 rows for AAPL in 2021-10.
Fetching AAPL data fo

In [7]:
import time as t
from twelvedata import TDClient
import pandas as pd
from datetime import datetime, timedelta, time
import os
from google.colab import drive
from tqdm import tqdm

# Initialize TwelveData client
td = TDClient(apikey=api_key)
drive.mount('/content/drive')

# API constraints
MAX_DAILY_REQUESTS = 800
MAX_MINUTE_REQUESTS = 8
DATA_LIMIT_PER_REQUEST = 5000

# Track API usage
api_usage = 452  # Update this dynamically if needed

def fetch_chunk(symbol, start_dt, end_dt, retries=3):
    """Fetch data chunk with retry logic, ensuring API constraints."""
    global api_usage

    if api_usage >= MAX_DAILY_REQUESTS:
        print("Daily API limit reached. Stopping execution.")
        return pd.DataFrame()

    for attempt in range(retries):
        try:
            data = td.time_series(
                symbol=symbol,
                interval="15min",
                start_date=start_dt.strftime("%Y-%m-%d %H:%M:%S"),
                end_date=end_dt.strftime("%Y-%m-%d %H:%M:%S"),
                outputsize=DATA_LIMIT_PER_REQUEST
            ).as_pandas()

            api_usage += 1  # Increment usage counter
            return data[data.index >= start_dt]  # Trim overlap
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            t.sleep(min(8 * (attempt + 1), 60))  # Exponential backoff, capped at 60s

    return pd.DataFrame()

def fetch_stock_data(symbol):
    """Fetch full stock data while respecting API rate limits."""
    global api_usage

    market_open = time(9, 30)
    market_close = time(16, 0)
    chunk_days = 14
    overlap = timedelta(minutes=30)

    all_data = []
    current_dt = datetime(2021, 1, 1)
    end_dt = datetime(2024, 12, 31)

    # Find first available trading day
    while current_dt <= end_dt:
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=1)
            continue

        test_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(current_dt, market_close),
            retries=1
        )
        if not test_data.empty:
            break
        current_dt += timedelta(days=1)

    pbar = tqdm(total=(end_dt - current_dt).days, desc=f"Fetching {symbol}")

    while current_dt <= end_dt and api_usage < MAX_DAILY_REQUESTS:
        chunk_end = min(current_dt + timedelta(days=chunk_days), end_dt)

        chunk_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(chunk_end, market_close) + overlap,
        )

        if not chunk_data.empty:
            all_data.append(chunk_data)
            pbar.update((chunk_end - current_dt).days)
            current_dt = chunk_data.index[-1].to_pydatetime() - overlap
        else:
            current_dt += timedelta(days=1)

        # Skip weekends efficiently
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=7 - current_dt.weekday())

        # Ensure we don't exceed per-minute limit
        if api_usage % MAX_MINUTE_REQUESTS == 0:
            t.sleep(61)  # Sleep slightly over 1 min to reset API window

    pbar.close()

    if all_data:
        final_data = pd.concat(all_data)
        final_data = final_data[~final_data.index.duplicated(keep='last')]
        final_data.sort_index(inplace=True)
        save_path = f'/content/drive/My Drive/{symbol}_data.csv'
        final_data.to_csv(save_path)
        print(f"Saved {len(final_data)} records for {symbol} at {save_path}")

# Fetch data for both stocks
fetch_stock_data("AAPL")


In [9]:
import time as t
from twelvedata import TDClient
import pandas as pd
from datetime import datetime, timedelta, time
import os
from google.colab import drive
from tqdm import tqdm

# Initialize TwelveData client
td = TDClient(apikey=api_key)
drive.mount('/content/drive')

# API constraints
MAX_DAILY_REQUESTS = 800
MAX_MINUTE_REQUESTS = 8
DATA_LIMIT_PER_REQUEST = 5000

# Track API usage
api_usage = 452  # Update this dynamically if needed

def fetch_chunk(symbol, start_dt, end_dt, retries=3):
    """Fetch data chunk with retry logic, ensuring API constraints."""
    global api_usage

    if api_usage >= MAX_DAILY_REQUESTS:
        print("Daily API limit reached. Stopping execution.")
        return pd.DataFrame()

    for attempt in range(retries):
        try:
            data = td.time_series(
                symbol=symbol,
                interval="15min",
                start_date=start_dt.strftime("%Y-%m-%d %H:%M:%S"),
                end_date=end_dt.strftime("%Y-%m-%d %H:%M:%S"),
                outputsize=DATA_LIMIT_PER_REQUEST
            ).as_pandas()

            api_usage += 1  # Increment usage counter
            return data[data.index >= start_dt]  # Trim overlap
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            t.sleep(min(8 * (attempt + 1), 60))  # Exponential backoff, capped at 60s

    return pd.DataFrame()

def fetch_stock_data(symbol):
    """Fetch full stock data while respecting API rate limits."""
    global api_usage

    market_open = time(9, 30)
    market_close = time(16, 0)
    chunk_days = 14
    overlap = timedelta(minutes=30)

    all_data = []
    current_dt = datetime(2021, 1, 1)
    end_dt = datetime(2024, 12, 31)

    # Find first available trading day
    while current_dt <= end_dt:
        if current_dt.weekday() >= 5:  # Skip weekends
            current_dt += timedelta(days=1)
            continue

        test_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(current_dt, market_close),
            retries=1
        )
        if not test_data.empty:
            break
        current_dt += timedelta(days=1)

    # Calculate total calendar days (simpler approach)
    total_days = (end_dt - current_dt).days
    if total_days <= 0:
        print(f"No valid data available for {symbol}. Exiting.")
        return

    pbar = tqdm(total=total_days, desc=f"Fetching {symbol}")

    while current_dt <= end_dt and api_usage < MAX_DAILY_REQUESTS:
        # Skip weekends
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=1)
            pbar.update(1)
            continue

        chunk_end = min(current_dt + timedelta(days=chunk_days), end_dt)

        chunk_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(chunk_end, market_close) + overlap,
        )

        if not chunk_data.empty:
            all_data.append(chunk_data)
            # Update progress by actual days processed (simple approach)
            days_processed = (chunk_end - current_dt).days
            pbar.update(days_processed)
            current_dt = chunk_end
        else:
            # If no data, just move forward by chunk_days
            pbar.update(chunk_days)
            current_dt += timedelta(days=chunk_days)

        # API rate limiting
        if api_usage % MAX_MINUTE_REQUESTS == 0:
            t.sleep(61)

    pbar.close()

    if all_data:
        final_data = pd.concat(all_data)
        final_data = final_data[~final_data.index.duplicated(keep='last')]
        final_data.sort_index(inplace=True)
        save_path = f'/content/drive/My Drive/{symbol}_data.csv'
        final_data.to_csv(save_path)
        print(f"Saved {len(final_data)} records for {symbol} at {save_path}")

    print(f"Fetching complete for {symbol} ✅")

# Fetch data for both stocks
fetch_stock_data("AAPL")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Attempt 1 failed: No data is available on the specified dates. Try setting different start/end dates.






Fetching AAPL:   0%|          | 0/1457 [00:00<?, ?it/s][A[A[A[A



Fetching AAPL:   1%|          | 14/1457 [00:00<01:09, 20.79it/s][A[A[A[A



Fetching AAPL:   2%|▏         | 28/1457 [00:01<01:08, 20.98it/s][A[A[A[A



Fetching AAPL:   3%|▎         | 42/1457 [00:01<01:07, 21.11it/s][A[A[A[A



Fetching AAPL:   3%|▎         | 42/1457 [00:16<01:07, 21.11it/s][A[A[A[A



Fetching AAPL:   4%|▍         | 56/1457 [01:03<41:20,  1.77s/it][A[A[A[A



Fetching AAPL:   5%|▍         | 70/1457 [01:04<26:38,  1.15s/it][A[A[A[A



Fetching AAPL:   6%|▌         | 84/1457 [01:05<17:51,  1.28it/s][A[A[A[A



Fetching AAPL:   7%|▋         | 98/1457 [01:06<12:19,  1.84it/s][A[A[A[A



Fetching AAPL:   8%|▊         | 112/1457 [01:07<08:44,  2.57it/s][A[A[A[A



Fetching AAPL:   9%|▊         | 126/1457 [01:07<06:20,  3.50it/s][A[A[A[A



Fetching AAPL:  10%|▉         | 140/1457 [01:09<05:08,  4.28it/s][A[A[A[A



Fetching AAPL:  11%|█         | 154/1457 [

Attempt 1 failed: You have run out of API credits for the day. 801 API credits were used, with the current limit being 800. Wait for the next day or consider switching to a paid plan that will remove daily limits at https://twelvedata.com/pricing
Attempt 2 failed: You have run out of API credits for the day. 802 API credits were used, with the current limit being 800. Wait for the next day or consider switching to a paid plan that will remove daily limits at https://twelvedata.com/pricing
Attempt 3 failed: You have run out of API credits for the day. 803 API credits were used, with the current limit being 800. Wait for the next day or consider switching to a paid plan that will remove daily limits at https://twelvedata.com/pricing






Fetching AAPL: 1471it [42:51,  1.75s/it]


Saved 26074 records for AAPL at /content/drive/My Drive/AAPL_data.csv
Fetching complete for AAPL ✅


Run this code tomorrow to see if it works

In [None]:
def fetch_stock_data(symbol):
    """Fetch full stock data while respecting API rate limits."""
    global api_usage

    market_open = time(9, 30)
    market_close = time(16, 0)
    chunk_days = 14
    overlap = timedelta(minutes=30)

    all_data = []
    current_dt = datetime(2021, 1, 1)
    end_dt = datetime(2024, 12, 31)

    # Find first available trading day
    while current_dt <= end_dt:
        if current_dt.weekday() >= 5:  # Skip weekends
            current_dt += timedelta(days=1)
            continue

        test_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(current_dt, market_close),
            retries=1
        )
        if not test_data.empty:
            break
        current_dt += timedelta(days=1)

    # Calculate total calendar days
    total_days = (end_dt - current_dt).days + 1  # +1 to include both start and end dates
    if total_days <= 0:
        print(f"No valid data available for {symbol}. Exiting.")
        return

    pbar = tqdm(total=total_days, desc=f"Fetching {symbol}")
    processed_days = 0

    while current_dt <= end_dt and api_usage < MAX_DAILY_REQUESTS and processed_days < total_days:
        # Skip weekends
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=1)
            pbar.update(1)
            processed_days += 1
            continue

        chunk_end = min(current_dt + timedelta(days=chunk_days - 1), end_dt)  # -1 because we include current day

        chunk_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(chunk_end, market_close) + overlap,
        )

        days_in_chunk = (chunk_end - current_dt).days + 1  # +1 to include both start and end dates

        if not chunk_data.empty:
            all_data.append(chunk_data)
            pbar.update(days_in_chunk)
            processed_days += days_in_chunk
            current_dt = chunk_end + timedelta(days=1)  # Move to next day after chunk_end
        else:
            # If no data, move forward by chunk days
            pbar.update(days_in_chunk)
            processed_days += days_in_chunk
            current_dt = chunk_end + timedelta(days=1)

        # API rate limiting
        if api_usage % MAX_MINUTE_REQUESTS == 0:
            t.sleep(61)

        # Print status every 100 API calls
        if api_usage % 100 == 0:
            print(f"API calls used: {api_usage}, Current date: {current_dt.strftime('%Y-%m-%d')}")

    pbar.close()

    if all_data:
        final_data = pd.concat(all_data)
        final_data = final_data[~final_data.index.duplicated(keep='last')]
        final_data.sort_index(inplace=True)
        save_path = f'/content/drive/My Drive/{symbol}_data.csv'
        final_data.to_csv(save_path)
        print(f"Saved {len(final_data)} records for {symbol} at {save_path}")

    if current_dt > end_dt:
        print(f"Successfully processed all dates for {symbol} ✅")
    elif api_usage >= MAX_DAILY_REQUESTS:
        print(f"Stopped due to reaching API limit at {current_dt.strftime('%Y-%m-%d')} ⚠️")
    else:
        print(f"Unexpected exit condition at {current_dt.strftime('%Y-%m-%d')}")

# Fetch data for both stocks
fetch_stock_data("MSFT")

## [2 points] How many API credits would be required for downloading 4 years worth of data at 15-minute intervals, per stock?

## [3 points] The API applies a rate limit. How long would it take to download 4 years worth of data at 15-minute intervals, per stock?

## [20 points for setting up data collection] We’ll be working with just 2 stocks: AAPL and MSFT. The need for speed in a price API makes it inevitable that a small percentage of data elements has errors in it. To compensate for such errors, a “good enough” strategy is to request data for overlapping periods, say fetching 30 minutes of data every 15 minutes – replacing the most recently arrived data with a corrected version.

Below is the setup for using the api to gather the stock data. The full script I ran to gather the data using the API is under the following question.

In [2]:
!pip install twelvedata
import twelvedata

from google.colab import drive
import sys
drive.mount("/content/drive/", force_remount=True)
sys.path.append("/content/drive/My Drive/Colab Notebooks")
from key import twelveDataKey as api_key

Collecting twelvedata
  Downloading twelvedata-1.2.24-py2.py3-none-any.whl.metadata (19 kB)
Collecting pytimeparse<2,>=1.1 (from twelvedata)
  Downloading pytimeparse-1.1.8-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading twelvedata-1.2.24-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytimeparse-1.1.8-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: pytimeparse, twelvedata
Successfully installed pytimeparse-1.1.8 twelvedata-1.2.24
Mounted at /content/drive/


## [25 points for collecting data] Technically, final testing can only be done while the market is open: 9:30 am to 4:00 pm on weekdays. In a compromise to accommodate everyone’s schedules we will pull the stock data of each stock and gather up at least 4 years of stock price data (January 2021-December 2024) and use it as the basis of our work.


Below is the script I ran for getting the data and storing them in csv files.

In [None]:
import time as t
from twelvedata import TDClient
import pandas as pd
from datetime import datetime, timedelta, time
import os
from google.colab import drive
from tqdm import tqdm

# Initialize TwelveData client
td = TDClient(apikey=api_key)
drive.mount('/content/drive')

# API constraints
MAX_DAILY_REQUESTS = 800
MAX_MINUTE_REQUESTS = 8
DATA_LIMIT_PER_REQUEST = 5000

# Track API usage
api_usage = 452  # Update this dynamically if needed

def fetch_chunk(symbol, start_dt, end_dt, retries=3):
    """Fetch data chunk with retry logic, ensuring API constraints."""
    global api_usage

    if api_usage >= MAX_DAILY_REQUESTS:
        print("Daily API limit reached. Stopping execution.")
        return pd.DataFrame()

    for attempt in range(retries):
        try:
            data = td.time_series(
                symbol=symbol,
                interval="15min",
                start_date=start_dt.strftime("%Y-%m-%d %H:%M:%S"),
                end_date=end_dt.strftime("%Y-%m-%d %H:%M:%S"),
                outputsize=DATA_LIMIT_PER_REQUEST
            ).as_pandas()

            api_usage += 1  # Increment usage counter
            return data[data.index >= start_dt]  # Trim overlap
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            t.sleep(min(8 * (attempt + 1), 60))  # Exponential backoff, capped at 60s

    return pd.DataFrame()

def fetch_stock_data(symbol):
    """Fetch full stock data while respecting API rate limits."""
    global api_usage

    market_open = time(9, 30)
    market_close = time(16, 0)
    chunk_days = 14
    overlap = timedelta(minutes=30)

    all_data = []
    current_dt = datetime(2021, 1, 1)
    end_dt = datetime(2024, 12, 31)

    # Find first available trading day
    while current_dt <= end_dt:
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=1)
            continue

        test_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(current_dt, market_close),
            retries=1
        )
        if not test_data.empty:
            break
        current_dt += timedelta(days=1)

    pbar = tqdm(total=(end_dt - current_dt).days, desc=f"Fetching {symbol}")

    while current_dt <= end_dt and api_usage < MAX_DAILY_REQUESTS:
        chunk_end = min(current_dt + timedelta(days=chunk_days), end_dt)

        chunk_data = fetch_chunk(
            symbol,
            datetime.combine(current_dt, market_open),
            datetime.combine(chunk_end, market_close) + overlap,
        )

        if not chunk_data.empty:
            all_data.append(chunk_data)
            pbar.update((chunk_end - current_dt).days)
            current_dt = chunk_data.index[-1].to_pydatetime() - overlap
        else:
            current_dt += timedelta(days=1)

        # Skip weekends efficiently
        if current_dt.weekday() >= 5:
            current_dt += timedelta(days=7 - current_dt.weekday())

        # Ensure we don't exceed per-minute limit
        if api_usage % MAX_MINUTE_REQUESTS == 0:
            t.sleep(61)  # Sleep slightly over 1 min to reset API window

    pbar.close()

    if all_data:
        final_data = pd.concat(all_data)
        final_data = final_data[~final_data.index.duplicated(keep='last')]
        final_data.sort_index(inplace=True)
        save_path = f'/content/drive/My Drive/{symbol}_data.csv'
        final_data.to_csv(save_path)
        print(f"Saved {len(final_data)} records for {symbol} at {save_path}")

# Fetch data for both stocks
fetch_stock_data("AAPL")


## [25 points] Create a program new-stock-price-feeder.py that uses a more modern API (e.g., twelveData) instead.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, lag, when, lit, input_file_name, round
from pyspark.sql.window import Window

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("StockTradingStrategy") \
    .getOrCreate()

# Read all CSV files from HDFS directory
stock_df = spark.read.csv(
    "hdfs:///user/root/stock_data/",
    header=True,
    inferSchema=True
)

# Add symbol column based on filename
stock_df = stock_df.withColumn(
    "filename", input_file_name()
).withColumn(
    "symbol",
    when(
        col("filename").contains("AAPL"), "AAPL"
    ).when(
        col("filename").contains("MSFT"), "MSFT"
    ).otherwise("UNKNOWN")
).drop("filename")

# Verify we have both symbols
print("Distinct symbols found:")
stock_df.select("symbol").distinct().show()

# Define window specs for moving averages
window_10 = Window.partitionBy("symbol").orderBy("datetime").rowsBetween(-9, 0)
window_40 = Window.partitionBy("symbol").orderBy("datetime").rowsBetween(-39, 0)

# Calculate moving averages
stock_with_ma = stock_df.withColumn("ma_10", avg("close").over(window_10)) \
                       .withColumn("ma_40", avg("close").over(window_40))

# Function to generate trading signals with share calculations
def generate_trades(df):
    window_spec = Window.partitionBy("symbol").orderBy("datetime")
    return df.withColumn("prev_ma_10", lag("ma_10", 1).over(window_spec)) \
            .withColumn("prev_ma_40", lag("ma_40", 1).over(window_spec)) \
            .withColumn("signal",
                when(
                    (col("prev_ma_10") <= col("prev_ma_40")) &
                    (col("ma_10") > col("ma_40")), "buy"
                ).when(
                    (col("prev_ma_10") >= col("prev_ma_40")) &
                    (col("ma_10") < col("ma_40")), "sell"
                ).otherwise(None)
            ) \
            .withColumn("shares", round(lit(100000)/col("close"))) \
            .filter(col("signal").isNotNull()) \
            .select(
                "datetime",
                "symbol",
                "close",
                "signal",
                "shares"
            )

# Generate trades
trades_df = generate_trades(stock_with_ma)

# Format output as requested
formatted_trades = trades_df.rdd.map(
    lambda row: f"({row['datetime']} {row['signal']} {row['symbol']}) - {row['shares']} shares @ ${row['close']:.2f}"
).collect()

# Save results to HDFS
trades_df.write.mode("overwrite") \
    .option("header", "true") \
    .csv("hdfs:///user/root/stock_trades/")

# Print formatted trades
print("\nTrading Recommendations ($100K per trade):")
for trade in formatted_trades:
    print(trade)

# Sample output:
# (2021-01-04 11:30:00 buy AAPL) - 758 shares @ $132.06
# (2021-01-05 14:45:00 sell AAPL) - 742 shares @ $134.75
# (2021-01-04 10:15:00 buy MSFT) - 459 shares @ $217.89
# (2021-01-05 15:30:00 sell MSFT) - 447 shares @ $223.71

 ## [10 pts] Within Spark, filter the incoming date to create aaplPrice and msftPrice streams.

The code is shown above, but the excerpt is pasted below as well. This chunk of code grabs the csv that is stored in hdfs from the earlier steps.

In [None]:
# Read all CSV files from HDFS directory
stock_df = spark.read.csv(
    "hdfs:///user/root/stock_data/",
    header=True,
    inferSchema=True
)

## [10 pts] From aaplPrice produce two other streams aapl10Day, aapl40Day. Both of these streams and their comparison to generate buy/sell signals are not shown in the diagram above.

The code for this is shown above in the entire code chunk. The excerpt is pasted below as well.

In [None]:
window_10 = Window.partitionBy("symbol").orderBy("datetime").rowsBetween(-9, 0)
window_40 = Window.partitionBy("symbol").orderBy("datetime").rowsBetween(-39, 0)

## [10 pts] From msftPrice produce two more streams msft10Day and msft40Day .

 The code is generalized as shown above, so the code for this is the same as for AAPL.

## [20 pts]. Compare the two moving averages (10-day MA and the 40-day MA) to indicate buy and sell signals . Your output should be of the form [( <datetime> buy <symbol>), ( <datetime> sell <symbol>), etc].

Example output from AAPL part of downloaded csv file:

        datetime	symbol	close	signal	shares
        2021-01-04T12:00:00.000Z	AAPL	127.41	sell	785
        2021-01-05T10:00:00.000Z	AAPL	131.4893	buy	761
        2021-01-06T10:15:00.000Z	AAPL	128.9501	sell	775
        2021-01-07T11:30:00.000Z	AAPL	130.705	buy	765
        2021-01-11T10:00:00.000Z	AAPL	129.69	sell	771
        2021-01-13T10:00:00.000Z	AAPL	129.82001	buy	770
        2021-01-14T11:30:00.000Z	AAPL	130.21001	sell	768
        2021-01-19T14:45:00.000Z	AAPL	127.89	buy	782
        2021-01-26T12:15:00.000Z	AAPL	142.405	sell	702
        2021-01-26T13:45:00.000Z	AAPL	142.965	buy	699
        2021-01-27T15:15:00.000Z	AAPL	141.64999	sell	706

Example output from MSFT part of csv file:

        2021-01-05T14:45:00.000Z	MSFT	217.64	buy	459
        2021-01-06T09:45:00.000Z	MSFT	213.2406	sell	469
        2021-01-07T11:00:00.000Z	MSFT	217.64	buy	459
        2021-01-08T13:15:00.000Z	MSFT	217.75999	sell	459
        2021-01-08T15:15:00.000Z	MSFT	219.55	buy	455
        2021-01-11T11:15:00.000Z	MSFT	218.08	sell	459
        2021-01-13T11:45:00.000Z	MSFT	215.95	buy	463
        2021-01-14T11:45:00.000Z	MSFT	214.6358	sell	466
        2021-01-19T10:00:00.000Z	MSFT	214.50999	buy	466
        2021-01-19T10:30:00.000Z	MSFT	213.49001	sell	468
        2021-01-19T10:45:00.000Z	MSFT	213.87	buy	468
        2021-01-19T11:00:00.000Z	MSFT	212.7641	sell	470
        2021-01-19T11:45:00.000Z	MSFT	213.6147	buy	468
        2021-01-22T15:45:00.000Z	MSFT	225.95	sell	443
        2021-01-25T10:30:00.000Z	MSFT	228.48	buy	438
        2021-01-25T10:45:00.000Z	MSFT	226.18401	sell	442
        2021-01-25T14:00:00.000Z	MSFT	228.23	buy	438

As you can see above, the csv file is formatted by datetime, stock symbol, close price, buy/sell signal, and then finally the number of shares, assuming you are buying or selling 100,000 in stocks.