# Final Project

Alan Wang

In [2]:
#pip install yfinance pandas numpy ta scikit-learn matplotlib

In [3]:
import random
random.seed(5040)

import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Download daily price history for the ticker.
# auto_adjust is passed explicitly instead of relying on the library default,
# so the meaning of the 'Close' column does not depend on the installed
# yfinance version (and no default-change warning is triggered by omission).
# NOTE(review): True is assumed to match the default of the yfinance version
# that produced the recorded results -- confirm against that environment.
ticker = "SPY"
data = yf.download(
    ticker,
    start="2015-01-01",
    end="2023-12-31",
    progress=False,
    auto_adjust=True,
)

# Fail fast with a clear message instead of letting an empty frame surface
# later as an obscure error in the indicator / scaling steps.
if data.empty:
    raise RuntimeError(f"No price data downloaded for {ticker}")

# SAFE data preparation
def prepare_data(df):
    """Build a two-column frame ('Close', 'Volume') of flattened 1D values.

    Each of the two source columns is pulled out of ``df``, flattened with
    ``ravel()`` (so a single-column 2D selection collapses to 1D), and the
    result is re-assembled on the original index of ``df``.
    """
    flattened = {
        column: df[column].values.ravel()  # flatten to guarantee 1D
        for column in ('Close', 'Volume')
    }
    return pd.DataFrame(flattened, index=df.index)

# Flatten the downloaded frame into plain 1D 'Close' / 'Volume' columns.
clean_data = prepare_data(df=data)

# Custom indicator functions
def calculate_rsi(series, window=14):
    """Return an RSI-style oscillator based on simple rolling means.

    The period-over-period change of ``series`` is split into its positive
    part (gains) and the magnitude of its negative part (losses). Each is
    averaged with a plain rolling mean over ``window`` observations, and the
    ratio of the two averages is mapped through ``100 - 100 / (1 + ratio)``.
    Positions without a full window of changes are NaN.
    """
    change = series.diff()
    mean_up = change.clip(lower=0).rolling(window).mean()
    mean_down = (-change.clip(upper=0)).rolling(window).mean()
    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(series, fast=12, slow=26, signal=9):
    """Return the MACD line minus its signal line for ``series``.

    The MACD line is the difference between the ``fast``-span and
    ``slow``-span exponentially weighted means of ``series``; the signal
    line is the ``signal``-span exponentially weighted mean of that
    difference. The value returned is the gap between the two.
    """
    def _ema(values, span):
        # Exponentially weighted mean with pandas' default settings.
        return values.ewm(span=span).mean()

    spread = _ema(series, fast) - _ema(series, slow)
    return spread - _ema(spread, signal)

# Calculate returns and indicators MANUALLY
# 'Return' is the NEXT row's percentage change (shift(-1)), so each row pairs
# the current indicators with the following period's move. The last row has
# no next period and becomes NaN.
clean_data['Return'] = clean_data['Close'].pct_change().shift(-1)
clean_data['SMA_10'] = clean_data['Close'].rolling(10).mean()
clean_data['SMA_50'] = clean_data['Close'].rolling(50).mean()
clean_data['RSI'] = calculate_rsi(clean_data['Close'])
clean_data['MACD'] = calculate_macd(clean_data['Close'])
clean_data['Volume_MA'] = clean_data['Volume'].rolling(5).mean()

# Final cleanup: drop indicator warm-up rows and the final row, whose
# next-period return is undefined.
clean_data.dropna(inplace=True)

# Prepare features/target (target is 1 when the next-period return is > 0)
features = ['SMA_10', 'SMA_50', 'RSI', 'MACD', 'Volume_MA']
X = clean_data[features]
y = np.where(clean_data['Return'] > 0, 1, 0)

# Chronological 80/20 split (no shuffling: rows stay in index order)
split_idx = int(0.8 * len(X))
y_train, y_test = y[:split_idx], y[split_idx:]

# Standardize using statistics from the TRAINING slice only. Fitting the
# scaler on the full dataset would leak the test period's mean/variance into
# the training features and make the evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[:split_idx])
X_test = scaler.transform(X.iloc[split_idx:])

# Kept for backward compatibility: the full scaled matrix in original row order.
X_scaled = np.vstack([X_train, X_test])

print("Success! Data shape:", X_scaled.shape)

Success! Data shape: (2214, 5)


  data = yf.download(ticker, start="2015-01-01", end="2023-12-31", progress=False)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fixed label order so the confusion matrix is always 2x2 with the same
# layout, even if a model never predicts one of the classes.
class_labels = [0, 1]

# Train and evaluate each model on the held-out test slice
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n{name} Performance:")
    # zero_division=0 reports 0 (without a warning) for a class that the
    # model never predicts, instead of emitting an undefined-metric warning.
    print(classification_report(y_test, y_pred, zero_division=0))

    # Plot confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"{name} Confusion Matrix")
    plt.show()

# Feature Importance (Random Forest)
# Rank the input features by the fitted forest's importance scores and show
# them as a horizontal bar chart, most important first.
if "Random Forest" in models:
    rf_model = models["Random Forest"]
    importances = rf_model.feature_importances_
    feature_importance = (
        pd.DataFrame({'Feature': features, 'Importance': importances})
        .sort_values('Importance', ascending=False)
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='Importance', y='Feature')
    plt.title("Random Forest Feature Importance")
    plt.show()