# Milestone 2 - Feature Engineering, Selection & Modeling

This notebook analyzes the impact of Formula One race victories on sponsor stock prices using event study methodology and machine learning models.

In [None]:
import fastf1
import yfinance as yf
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Step 1: Get Race Winner Data

In [None]:
def get_race_winner_data(year):
    """Collect the winner of every race in one Formula One season.

    Args:
        year: Season to load, e.g. ``2023``.

    Returns:
        pandas.DataFrame with one row per race that loaded successfully and
        the columns ``date`` (``datetime.date``), ``race_name``, ``winner``
        and ``team``. Races that fail to load are reported on stdout and
        skipped, so the frame may contain fewer rows than the schedule.
    """
    import os

    # FastF1 expects the cache directory to exist before it is enabled.
    os.makedirs('f1_cache', exist_ok=True)
    fastf1.Cache.enable_cache('f1_cache')

    # Testing events have no race ('R') session, so leave them out of the loop.
    schedule = fastf1.get_event_schedule(year, include_testing=False)
    races = []

    for _, race in schedule.iterrows():
        try:
            session = fastf1.get_session(year, race['EventName'], 'R')
            # load() populates the session in place and returns None, so its
            # return value must not be used as the session object.
            session.load()
            # NOTE(review): assumes results are ordered by finishing position,
            # so the first row is the race winner - confirm against FastF1 docs.
            winner = session.results.iloc[0]
            races.append({
                'date': race['EventDate'].date(),
                'race_name': race['EventName'],
                'winner': winner.FullName,
                'team': winner.TeamName,
            })
        except Exception as e:
            # Best effort: one unavailable race should not abort the season.
            print(f"Failed to load {race['EventName']}: {e}")
    return pd.DataFrame(races)

# Build the winners table for the 2023 season and preview its first rows.
race_df = get_race_winner_data(year=2023)
race_df.head(n=5)

## Step 2: Get Sponsor Stock Data

In [None]:
def get_stock_data(ticker, start, end):
    """Download daily prices for one ticker and add simple daily returns.

    Args:
        ticker: Symbol passed to ``yfinance.download``.
        start: Start of the download range, forwarded as ``start``.
        end: End of the download range, forwarded as ``end``.

    Returns:
        pandas.DataFrame with the columns ``Adj Close`` and ``Return``, where
        ``Return`` is the row-over-row percentage change of ``Adj Close``
        (the first row is NaN).
    """
    # Request unadjusted data explicitly: recent yfinance releases auto-adjust
    # by default, in which case no separate 'Adj Close' column is returned.
    stock = yf.download(ticker, start=start, end=end, auto_adjust=False)

    # Recent releases may return (field, ticker) MultiIndex columns even for a
    # single ticker; keep only the field level so 'Adj Close' is a plain Series.
    if isinstance(stock.columns, pd.MultiIndex):
        stock.columns = stock.columns.get_level_values(0)

    stock['Return'] = stock['Adj Close'].pct_change()
    return stock[['Adj Close', 'Return']]

## Step 3: Join Race and Stock Data + Engineer Features

In [None]:
def get_stock_window(race_date, ticker):
    """Fetch stock data for the two weeks centred on a race date.

    Args:
        race_date: Date of the race; must support ``timedelta`` arithmetic.
        ticker: Symbol forwarded to ``get_stock_data``.

    Returns:
        Whatever ``get_stock_data`` returns for the range running from seven
        days before ``race_date`` to seven days after it.
    """
    half_window = datetime.timedelta(days=7)
    return get_stock_data(
        ticker,
        race_date - half_window,
        race_date + half_window,
    )

def engineer_features(race_df, sponsor_ticker):
    """Attach the sponsor's price move around each race to the race table.

    For every race, the last available ``Adj Close`` strictly before the race
    date is compared with the first one strictly after it.

    Args:
        race_df: DataFrame with at least a ``date`` column; all of its columns
            are copied into the result.
        sponsor_ticker: Ticker symbol forwarded to ``get_stock_window``.

    Returns:
        pandas.DataFrame with the columns of ``race_df`` plus
        ``price_change_pct``: the fractional change ``(after - before) /
        before`` (a ratio, not multiplied by 100), or ``None`` when no price
        exists on one side of the race date inside the window.
    """
    data = []
    for _, row in race_df.iterrows():
        stock = get_stock_window(row['date'], sponsor_ticker)

        # Compare like with like: a DatetimeIndex cannot be ordered against a
        # plain datetime.date, which previously raised inside the try block
        # and silently turned every result into None.
        event_day = pd.Timestamp(row['date'])
        index_tz = getattr(stock.index, 'tz', None)
        if index_tz is not None and event_day.tzinfo is None:
            event_day = event_day.tz_localize(index_tz)

        try:
            before = stock.loc[stock.index < event_day, 'Adj Close'].iloc[-1]
            after = stock.loc[stock.index > event_day, 'Adj Close'].iloc[0]
        except (IndexError, KeyError):
            # IndexError: no trading day on one side of the race date.
            # KeyError: the price column is missing from the downloaded frame.
            pct_change = None
        else:
            pct_change = (after - before) / before

        data.append({
            **row,
            'price_change_pct': pct_change,
        })
    return pd.DataFrame(data)

# Compute the post-race price move of the "SHEL" ticker for every race and
# preview the first rows.
feature_df = engineer_features(race_df=race_df, sponsor_ticker="SHEL")
feature_df.head(n=5)

## Step 4: Feature Selection with Correlation

In [None]:
# Correlate whatever columns remain once the date and the three text columns
# have been removed, then draw the matrix as an annotated heatmap.
corr_matrix = feature_df.drop(
    ['date', 'race_name', 'winner', 'team'],
    axis='columns',
).corr()
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

## Step 5: Data Modeling with Random Forest

In [None]:
# Coerce to float so races without a computable price change become NaN; a
# column holding None cannot be reliably compared with 0.
price_change = pd.to_numeric(feature_df['price_change_pct'], errors='coerce')

# Binary target: 1 when the sponsor's price rose across the race, else 0.
# The value is only meaningful for rows where price_change_pct is present.
feature_df['label'] = (price_change > 0).astype(int)

# Train only on races with a known price change; otherwise missing data would
# be learned as "price did not rise".
model_df = feature_df[price_change.notna()]

# Drop identifiers and target-derived columns only. The remaining categorical
# columns (winner, team) are the predictors that get one-hot encoded below;
# dropping them as well would leave the model with zero features.
X = model_df.drop(columns=['date', 'race_name', 'price_change_pct', 'label'])
X = pd.get_dummies(X, drop_first=True)
y = model_df['label']

if X.shape[1] == 0:
    raise ValueError(
        "No predictor columns left after encoding; "
        "check that feature_df contains varying feature values."
    )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the forest as well as the split so the report is reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))