<a href="https://colab.research.google.com/github/anushka1511/S-P-500-Prediction-Using-ML-Techniques/blob/main/s%26p_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. SETUP AND LIBRARIES
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm

# Machine Learning Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Performance Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Function for SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes |y_pred - y_true| / ((|y_true| + |y_pred|) / 2);
    the mean of those ratios is multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Lists, tuples, NumPy arrays and
        pandas Series are all accepted; values are compared by position.

    Returns
    -------
    float
        SMAPE as a percentage (0 for a perfect prediction).
    """
    # Coerce to plain float arrays so that list inputs work and so that two
    # pandas Series are compared element-by-element rather than being
    # re-aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # The 1e-8 term keeps the ratio finite when both values are zero.
    return np.mean(numerator / (denominator + 1e-8)) * 100


# DATA ACQUISITION: S&P 500 COMPANY DATA
# Build `sp500_tickers` from the first table on the Wikipedia constituents
# page, falling back to a short hard-coded list if the page cannot be read.
print("Fetching S&P 500 tickers...")
try:
    sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    sp500_table = pd.read_html(sp500_url)
    # Yahoo Finance writes share classes with a dash (see 'BRK-B' in the
    # fallback list below), while the scraped table may use a dot ('BRK.B').
    # Normalise so those symbols resolve when passed to yfinance.
    sp500_tickers = (
        sp500_table[0]['Symbol']
        .str.strip()
        .str.replace('.', '-', regex=False)
        .tolist()
    )
    print(f"Found {len(sp500_tickers)} tickers.")
except Exception as e:
    print(f"Could not fetch tickers from Wikipedia, using a fallback list. Error: {e}")
    # Fallback list
    sp500_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'TSLA', 'META', 'BRK-B', 'JNJ', 'V']

# Fields read from each ticker's `info` mapping. 'marketCap' is used further
# down as the regression target and 'sector' as the categorical input.
feature_keys = [
    'marketCap', 'enterpriseValue', 'trailingPE', 'forwardPE',
    'ebitda', 'totalRevenue', 'grossMargins', 'operatingMargins',
    'totalDebt', 'totalCash', 'bookValue', 'sector', 'fullTimeEmployees'
]

company_data = []
print(f"\nFetching financial data for {len(sp500_tickers)} companies...")
for ticker in tqdm(sp500_tickers, desc="Processing tickers"):
    try:
        info = yf.Ticker(ticker).info
        # One record per company; any field absent from `info` becomes NaN.
        record = {'ticker': ticker, **{key: info.get(key, np.nan) for key in feature_keys}}
        company_data.append(record)
    except Exception as e:
        # Best effort: report the failing ticker and carry on with the rest.
        print(f"Could not fetch data for {ticker}: {e}")

# Assemble all collected records into a single frame and preview it.
df = pd.DataFrame(company_data)
print("\nData acquisition complete.")
print(df.head())

# DATA PREPROCESSING AND FEATURE ENGINEERING
print("\nStarting data preprocessing...")

# Keep only rows whose target (marketCap) is present and strictly positive.
df = df.dropna(subset=['marketCap'])
df = df[df['marketCap'] > 0]

# Target vector, and the feature matrix without the target or the identifier.
# NOTE(review): enterpriseValue stays in the feature set; presumably it is
# closely tied to marketCap, so confirm this is not target leakage.
y = df['marketCap']
X = df.drop(columns=['marketCap', 'ticker'])

categorical_features = ['sector']
numerical_features = list(X.select_dtypes(include=np.number).columns)

# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Preprocessing complete.")

# MODEL TRAINING AND EVALUATION
print("\nTraining and evaluating models...")

# SVR is sensitive to the scale of the target: its dual coefficients are
# capped at C and epsilon is measured in target units. Fitted directly on
# market caps (around 1e10-1e11 in the preview above) with C=1.0 and
# epsilon=0.2 it can barely move away from a constant prediction. Both SVRs
# are therefore trained on log1p(marketCap); TransformedTargetRegressor
# applies expm1 to the predictions, so every metric is still computed in
# dollars.
models = {
    "Support Vector Machine (SVR)": TransformedTargetRegressor(
        regressor=SVR(C=1.0, epsilon=0.2),  # C is the regularization parameter
        func=np.log1p,
        inverse_func=np.expm1,
    ),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, subsample=0.8, random_state=42),
}

# Stacking Ensemble Model: directly tests the hypothesis that "ensemble methods provide superior predictive accuracy".
# The base learners' predictions are combined by a linear meta-model.
base_estimators = [
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
    ('svr', TransformedTargetRegressor(
        regressor=SVR(C=1.0),
        func=np.log1p,
        inverse_func=np.expm1,
    )),
]
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=LinearRegression()
)
models["Stacking Ensemble"] = stacking_model


# Train, Predict, and Evaluate
# For every candidate model: fit preprocessing + estimator on the training
# split, score on the held-out split, and record four metrics.
results = {}
metric_labels = (
    'R-squared (R2)',
    'Mean Absolute Error (MAE)',
    'Root Mean Squared Error (RMSE)',
    'Symmetric MAPE (SMAPE %)',
)

for name, model in models.items():
    model_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', model)])
    y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    smp = smape(y_test, y_pred)

    # Results, keyed by metric label in the order listed above.
    results[name] = dict(zip(metric_labels, (r2, mae, rmse, smp)))

    print(f"--- {name} ---")
    print(f"  R2: {r2:.4f}")
    print(f"  MAE: ${mae:,.2f}")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  SMAPE: {smp:.2f}%")

print("\n--- Final Performance Comparison ---")
# One row per model; the two dollar-denominated columns are rendered as
# formatted strings for display.
results_df = pd.DataFrame(results).T
for dollar_column in ('Mean Absolute Error (MAE)', 'Root Mean Squared Error (RMSE)'):
    results_df[dollar_column] = results_df[dollar_column].apply(lambda x: f"${x:,.0f}")
print(results_df)

print("\n--- Feature Interpretability (from Random Forest) ---")
# Fit the Random Forest inside its own pipeline so the fitted preprocessor
# and the fitted regressor can be inspected together.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', models["Random Forest"])])
rf_pipeline.fit(X_train, y_train)

# Feature importances
importances = rf_pipeline.named_steps['regressor'].feature_importances_

fitted_preprocessor = rf_pipeline.named_steps['preprocessor']

# Look the categorical branch up by name rather than by position, so the
# lookup does not depend on the order of the ColumnTransformer entries.
ohe_feature_names = (
    fitted_preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_features)
)

# Ask the fitted transformer for its actual output columns instead of
# rebuilding them from the input column list: the median imputer can drop a
# column that is entirely NaN in the training split, which would leave the
# hand-built list longer than `importances`. Output names carry a
# '<transformer>__' prefix ('num__', 'cat__'), removed here so the labels
# read like the original column / one-hot names.
all_feature_names = [
    out_name.split('__', 1)[-1]
    for out_name in fitted_preprocessor.get_feature_names_out()
]

feature_importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance_df.head(10))

Fetching S&P 500 tickers...
Found 503 tickers.

Fetching financial data for 503 companies...


Processing tickers: 100%|██████████| 503/503 [01:38<00:00,  5.12it/s]



Data acquisition complete.
  ticker     marketCap  enterpriseValue  trailingPE  forwardPE        ebitda  \
0    MMM  8.181427e+10     8.898333e+10   18.931507  19.243038  5.328000e+09   
1    AOS  9.329455e+09     9.425059e+09   18.337990  16.130220  7.689000e+08   
2    ABT  2.337997e+11     2.404982e+11   17.451948  26.042637  1.102100e+10   
3   ABBV  3.220324e+11     3.869229e+11   78.244640  15.029678  2.736600e+10   
4    ACN  1.840281e+11     1.835507e+11   23.523884  20.999289  1.176778e+10   

   totalRevenue  grossMargins  operatingMargins     totalDebt     totalCash  \
0  2.451300e+10       0.41211           0.20675  1.413400e+10  7.024000e+09   
1  3.803200e+09       0.38042           0.18965  2.958000e+08  2.002000e+08   
2  4.234400e+10       0.55989           0.18092  1.329000e+10  6.844000e+09   
3  5.736700e+10       0.71015           0.30428  7.002400e+10  5.176000e+09   
4  6.848254e+10       0.32069           0.16825  8.165258e+09  9.637395e+09   

   bookValue    