# Milestone 2 - Feature Engineering, Selection & Modeling

This notebook analyzes the impact of Formula One race victories on sponsor stock prices using event study methodology and machine learning models.

In [None]:
import datetime
import os

import fastf1
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

## Step 1: Get Race Winner Data

In [None]:
def get_race_winner_data(year, cache_dir='f1_cache'):
    """Collect the winner of every race of one Formula One season.

    Parameters
    ----------
    year : int
        Season passed to FastF1 when requesting the schedule and sessions.
    cache_dir : str, optional
        Directory handed to ``fastf1.Cache.enable_cache``. It is created
        when missing so the function also works on a fresh checkout.

    Returns
    -------
    pandas.DataFrame
        One row per race whose results could be loaded, with the columns
        ``date`` (``datetime.date``), ``race_name``, ``winner`` and ``team``.
        Races that fail to load or have no results are reported with
        ``print`` and skipped; the columns are present even when no race
        could be loaded.
    """
    # enable_cache expects an existing directory, so create it up front.
    os.makedirs(cache_dir, exist_ok=True)
    fastf1.Cache.enable_cache(cache_dir)

    # Pre-season testing events have no race session; leave them out instead
    # of letting every one of them fail below.
    schedule = fastf1.get_event_schedule(year, include_testing=False)
    races = []

    for _, race in schedule.iterrows():
        event_name = race['EventName']
        try:
            session = fastf1.get_session(year, event_name, 'R')
            # Only the classification is read, so skip the heavy optional data.
            # NOTE(review): assumes results stay populated with these flags off - confirm
            session.load(laps=False, telemetry=False, weather=False, messages=False)
            results = session.results
            if results is None or results.empty:
                print(f"No results for {event_name}")
                continue
            # NOTE(review): assumes results are ordered by finishing position - confirm
            winner = results.iloc[0]
            races.append({
                'date': session.date.date(),
                'race_name': event_name,
                'winner': winner.FullName,
                'team': winner.TeamName
            })
        except Exception as e:
            # Best effort by design: one broken event must not abort the season.
            print(f"Failed to load {event_name}: {e}")

    return pd.DataFrame(races, columns=['date', 'race_name', 'winner', 'team'])

# Season under study; change SEASON_YEAR to analyse another championship year.
SEASON_YEAR = 2023
race_df = get_race_winner_data(SEASON_YEAR)
race_df.head()

## Step 2: Get Sponsor Stock Data

In [None]:
from alpha_vantage.timeseries import TimeSeries

# The Alpha Vantage key is a credential: read it from the environment rather
# than storing it in the notebook, where it would be shared with every copy.
# Set it before starting Jupyter, e.g. `export ALPHAVANTAGE_API_KEY=...`.
ALPHA_VANTAGE_API_KEY = os.environ.get('ALPHAVANTAGE_API_KEY')
if not ALPHA_VANTAGE_API_KEY:
    raise RuntimeError(
        "Alpha Vantage API key not found: set the ALPHAVANTAGE_API_KEY "
        "environment variable before running this notebook."
    )

# Client shared by the stock helpers below; responses come back as DataFrames.
ts = TimeSeries(key=ALPHA_VANTAGE_API_KEY, output_format='pandas')

# Full daily history per ticker, so repeated window requests for the same
# ticker cost a single API call instead of one download per request.
_FULL_HISTORY_CACHE = {}


def get_stock_data(ticker, start, end):
    """Return adjusted closes and daily returns for ``ticker`` in a date range.

    Parameters
    ----------
    ticker : str
        Symbol requested from Alpha Vantage.
    start, end : date-like
        Inclusive bounds of the window; anything ``pd.to_datetime`` accepts.

    Returns
    -------
    pandas.DataFrame
        Indexed by trading day in ascending order, with the columns
        ``Adj Close`` and ``Return``. ``Return`` is the day-over-day change
        of ``Adj Close`` computed on the full history, so the first day of
        the window has a value whenever an earlier trading day exists.
        On any failure a message is printed and an empty frame with the same
        two columns is returned; failures are not cached.
    """
    try:
        history = _FULL_HISTORY_CACHE.get(ticker)
        if history is None:
            # NOTE(review): the daily-adjusted endpoint may need a paid Alpha Vantage plan - confirm
            raw, _meta = ts.get_daily_adjusted(symbol=ticker, outputsize='full')
            raw.index = pd.to_datetime(raw.index)
            history = (
                raw.sort_index()[['5. adjusted close']]
                .rename(columns={'5. adjusted close': 'Adj Close'})
            )
            history['Return'] = history['Adj Close'].pct_change()
            _FULL_HISTORY_CACHE[ticker] = history

        in_window = (history.index >= pd.to_datetime(start)) & (history.index <= pd.to_datetime(end))
        # Copy so callers can modify the result without touching the cache.
        return history.loc[in_window, ['Adj Close', 'Return']].copy()
    except Exception as e:
        print(f"[!] Failed to fetch stock data for {ticker}: {e}")
        return pd.DataFrame(columns=['Adj Close', 'Return'])


## Step 3: Join Race and Stock Data + Engineer Features

In [None]:
def get_stock_window(race_date, ticker, days=7):
    """Fetch stock data for a symmetric window around a race.

    Parameters
    ----------
    race_date : datetime.date or datetime.datetime
        Centre of the window.
    ticker : str
        Symbol forwarded to ``get_stock_data``.
    days : int, optional
        Calendar days included on each side of ``race_date`` (default 7).

    Returns
    -------
    Whatever ``get_stock_data`` returns for
    ``[race_date - days, race_date + days]``.

    Raises
    ------
    ValueError
        If ``days`` is negative.
    """
    if days < 0:
        raise ValueError("days must be non-negative")
    margin = datetime.timedelta(days=days)
    return get_stock_data(ticker, race_date - margin, race_date + margin)

def engineer_features(race_df, sponsor_ticker):
    """Attach the sponsor's price move around each race to the race table.

    For every race the last ``Adj Close`` strictly before the race day and
    the first ``Adj Close`` strictly after it are taken from the window
    returned by ``get_stock_window``; their relative difference becomes
    ``price_change_pct``.

    Parameters
    ----------
    race_df : pandas.DataFrame
        Race table with at least a ``date`` column.
    sponsor_ticker : str
        Symbol whose prices are compared around each race.

    Returns
    -------
    pandas.DataFrame
        The columns of ``race_df`` plus ``price_change_pct``. Races with no
        stock data at all are reported and left out; races lacking a trading
        day on one side of the race keep their row with NaN.
    """
    records = []
    # Keyed by (ticker, race date), so it only saves a fetch when two rows
    # share the same date.
    stock_cache = {}

    for _, row in race_df.iterrows():
        race_date = row['date']
        cache_key = (sponsor_ticker, race_date)
        if cache_key not in stock_cache:
            stock_cache[cache_key] = get_stock_window(race_date, sponsor_ticker)
        stock = stock_cache[cache_key]

        if stock.empty:
            print(f"[!] No stock data for race on {race_date}")
            continue

        # A DatetimeIndex cannot be ordered against a plain datetime.date, so
        # compare against a Timestamp at midnight of the race day instead.
        race_day = pd.Timestamp(race_date).normalize()
        closes = stock['Adj Close']
        closes_before = closes[stock.index < race_day]
        closes_after = closes[stock.index > race_day]

        if closes_before.empty or closes_after.empty:
            print(
                f"[!] Error processing stock data for {race_date}: "
                "no trading day on both sides of the race in the fetched window"
            )
            # NaN rather than None keeps the column numeric.
            pct_change = float('nan')
        else:
            before = closes_before.iloc[-1]
            after = closes_after.iloc[0]
            pct_change = (after - before) / before

        records.append({**row.to_dict(), 'price_change_pct': pct_change})

    # Fix the schema explicitly so an empty result still has every column.
    out_columns = list(race_df.columns)
    if 'price_change_pct' not in out_columns:
        out_columns.append('price_change_pct')
    return pd.DataFrame(records, columns=out_columns)



# Ticker of the sponsor whose share price is studied; replace with the sponsor of interest.
SPONSOR_TICKER = "SHEL"
feature_df = engineer_features(race_df, SPONSOR_TICKER)
feature_df.head()

## Step 4: Feature Selection with Correlation

In [None]:
# Correlate numeric features only: identifier/text columns are removed by name
# and any other non-numeric column is filtered out by dtype.
numeric_features = (
    feature_df
    .drop(columns=['date', 'race_name', 'winner', 'team'], errors='ignore')
    .select_dtypes(include='number')
)
corr_matrix = numeric_features.corr()

if corr_matrix.empty:
    # The heatmap cannot be drawn from an empty matrix; say so instead of failing.
    print("No numeric feature columns available for the correlation heatmap.")
else:
    fig, ax = plt.subplots()
    sns.heatmap(corr_matrix, annot=True, ax=ax)
    ax.set_title('Correlation between numeric features')
    plt.show()

## Step 5: Data Modeling with Random Forest

In [None]:
# One seed for both the split and the forest so the report is reproducible.
RANDOM_STATE = 42

# Work on a separate frame so feature_df is left untouched, and drop races
# without a measured price move instead of labelling them as "no gain".
model_df = feature_df.dropna(subset=['price_change_pct']).copy()
model_df['label'] = (model_df['price_change_pct'] > 0).astype(int)

# Remove identifiers and the target; the remaining columns (winner, team) are
# the predictors and are one-hot encoded.
# NOTE(review): using winner/team as features is an assumption - confirm
X = model_df.drop(columns=['date', 'race_name', 'price_change_pct', 'label'])
X = pd.get_dummies(X, drop_first=True)
y = model_df['label']

if X.shape[1] == 0:
    raise ValueError(
        "No feature columns left after encoding; the model needs at least one "
        "predictor that varies between races."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))