In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [None]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

In [65]:
# Load the data
interpolated_dir = os.path.join(parent_dir, "dataset_interpolated")
def load_data(interpolated_dir):
    data = {}
    for folder in os.listdir(interpolated_dir):
        folder_path = os.path.join(interpolated_dir, folder)
        print(f"Loading data for {folder}")
        if os.path.isdir(folder_path):
            for file in os.listdir(folder_path):
                if file.endswith(".csv"):
                    file_path = os.path.join(folder_path, file)
                    df = pd.read_csv(file_path)
                    for _, row in df.iloc[1:].iterrows():
                        row["relative_strength"] = df.iloc[0]["homeWinProbability"]
                        row["away_team_id"] = df.iloc[0]["away_team_id"]
                        row["home_team_id"] = df.iloc[0]["home_team_id"]
                        row["home_win"] = df.iloc[0]["home_win"]
                        if row["timestep"] not in data:
                            data[row["timestep"]] = [row] 
                        else:
                            data[row["timestep"]] += [row]
        print(f"Loaded data for {folder}")
    return data

data = load_data(interpolated_dir)

Loading data for 2022
Loaded data for 2022
Loading data for 2023
Loaded data for 2023
Loading data for .DS_Store
Loaded data for .DS_Store
Loading data for 2019
Loaded data for 2019
Loading data for 2021
Loaded data for 2021
Loading data for 2020
Loaded data for 2020


In [66]:
# Get information about the data
for timestep in data:
    print(f"timestep: {timestep}, number of instances: {len(data[timestep])}")

timestep: 0.0, number of instances: 2096
timestep: 0.005, number of instances: 1512
timestep: 0.01, number of instances: 1377
timestep: 0.015, number of instances: 1519
timestep: 0.02, number of instances: 1439
timestep: 0.025, number of instances: 1544
timestep: 0.03, number of instances: 1530
timestep: 0.035, number of instances: 1523
timestep: 0.04, number of instances: 1548
timestep: 0.045, number of instances: 1522
timestep: 0.05, number of instances: 1572
timestep: 0.055, number of instances: 1569
timestep: 0.06, number of instances: 1578
timestep: 0.065, number of instances: 1603
timestep: 0.07, number of instances: 1577
timestep: 0.075, number of instances: 1611
timestep: 0.08, number of instances: 1616
timestep: 0.085, number of instances: 1573
timestep: 0.09, number of instances: 1627
timestep: 0.095, number of instances: 1621
timestep: 0.1, number of instances: 1642
timestep: 0.105, number of instances: 1606
timestep: 0.11, number of instances: 1633
timestep: 0.115, number o

In [None]:
features = ["home_win", "relative_strength", "homeScore", "awayScore", "scoringPlay", "start.down", "start.distance", "start.yardLine", "end.down", "end.distance", "end.yardLine"]
for timestep in data.keys():
    # Find rows that have NaN values for my array of features
    rows = data[timestep]
    # Get all rows such that one of the features is NaN
    rows_with_nan = [row for row in rows if any(np.isnan(row[feature]) for feature in features)]
    # Print row with NaN values + timestep
    if len(rows_with_nan) > 0:
        print("Timestep:", timestep)
        for row in rows_with_nan:
            print(row)

In [67]:
# Feature Selection
def feature_selection(data, features):
    # Given the features of the data, return data such that each row is an array of the values of the features
    # The data is a dictionary where the key is the timestep and the value is a list of rows
    feature_data = {}
    for timestep in data:
        feature_data[timestep] = []
        for row in data[timestep]:
            new_row = [[float(row[feature]) for feature in features]]
            # First check if the row has any NaN values
            if any(np.isnan(new_row[0])):
                continue
            feature_data[timestep] += new_row
    return feature_data

features_data = feature_selection(data, features)

In [68]:
# Setup model for each timestep
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# create an array of logistic regression models
models = {}
def setup_models(features_data):
    X_tests = {}
    y_tests = {}
    for timestep in features_data:
        X = np.array(features_data[timestep])[:,1:]
        y = np.array(features_data[timestep])[:,0]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_tests[timestep] = X_test
        y_tests[timestep] = y_test
        model = LogisticRegression(max_iter=1000)
        print("Training for timestep", timestep)
        model.fit(X_train, y_train)
        models[timestep] = model
    return X_tests, y_tests
X_tests, y_tests = setup_models(features_data)


Training for timestep 0.0
Training for timestep 0.005
Training for timestep 0.01
Training for timestep 0.015
Training for timestep 0.02
Training for timestep 0.025
Training for timestep 0.03
Training for timestep 0.035
Training for timestep 0.04
Training for timestep 0.045
Training for timestep 0.05
Training for timestep 0.055
Training for timestep 0.06
Training for timestep 0.065
Training for timestep 0.07
Training for timestep 0.075
Training for timestep 0.08
Training for timestep 0.085
Training for timestep 0.09
Training for timestep 0.095
Training for timestep 0.1
Training for timestep 0.105
Training for timestep 0.11
Training for timestep 0.115
Training for timestep 0.12
Training for timestep 0.125
Training for timestep 0.13
Training for timestep 0.135
Training for timestep 0.14
Training for timestep 0.145
Training for timestep 0.15
Training for timestep 0.155
Training for timestep 0.16
Training for timestep 0.165
Training for timestep 0.17
Training for timestep 0.175
Training for

In [None]:
# Test accuracy of model for each timestep on test data and plot
accuracies = []
timesteps = []
for timestep in models:
    model = models[timestep]
    X_test = X_tests[timestep]
    y_test = y_tests[timestep]
    accuracy = model.score(X_test, y_test)
    accuracies.append(accuracy)
    timesteps.append(timestep)

plt.plot(timesteps, accuracies)
plt.xlabel("Timestep")
plt.ylabel("Accuracy")
plt.title("Accuracy of Logistic Regression Model for Each Timestep")
plt.show()

Timestep: 0.0
home_team_id                   33.0
away_team_id                    4.0
home_win                        1.0
sequenceNumber                  100
homeWinProbability             0.81
homeScore                       NaN
scoringPlay                     NaN
priority                        NaN
statYardage                     NaN
awayScore                       NaN
wallclock                       NaN
modified                        NaN
id                              NaN
text                            NaN
period.number                   NaN
scoringType.displayName         NaN
scoringType.name                NaN
scoringType.abbreviation        NaN
start.distance                  NaN
start.yardLine                  NaN
start.team.id                   NaN
start.down                      NaN
start.yardsToEndzone            NaN
clock.displayValue              NaN
type.id                         NaN
type.text                       NaN
type.abbreviation               NaN
end.distance  