In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [None]:
# Make the directory one level above the notebook's working directory importable.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guarded so that re-running this cell does not keep appending duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [45]:
# Load the data: the CSVs are read from the "dataset_interpolated" folder under parent_dir.
interpolated_dir = os.path.join(parent_dir, "dataset_interpolated")
def load_data(interpolated_dir, seasons=("2019",)):
    """Load the interpolated CSV files and bucket their rows by timestep.

    Every ``.csv`` file inside a selected sub-folder of ``interpolated_dir`` is
    treated as one unit: its first row supplies ``homeWinProbability`` (copied
    to each later row as ``relative_strength``), ``away_team_id``,
    ``home_team_id`` and ``home_win``; every later row is stamped with those
    values and filed under its own ``timestep``.

    Args:
        interpolated_dir: Directory whose sub-folders contain the CSV files.
        seasons: Sub-folder name(s) to load. May be a single name, an iterable
            of names (compared as strings), or ``None`` to load every
            sub-folder. Defaults to ``("2019",)``.

    Returns:
        dict mapping each timestep value to the list of rows (``pandas.Series``)
        observed at that timestep, in sorted folder/file order.

    Raises:
        KeyError: if a CSV with at least two rows lacks one of the columns
            read here.
    """
    if isinstance(seasons, str):
        # A bare string would otherwise be iterated character by character.
        seasons = (seasons,)
    wanted = None if seasons is None else {str(season) for season in seasons}

    data = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any later seeded split) reproducible.
    for folder in sorted(os.listdir(interpolated_dir)):
        if wanted is not None and folder not in wanted:
            continue
        folder_path = os.path.join(interpolated_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if not file.endswith(".csv"):
                continue
            df = pd.read_csv(os.path.join(folder_path, file))
            if len(df) < 2:
                # Only the first row (or nothing at all): no rows to collect.
                continue
            # Read the first-row values once instead of once per row.
            first_row = df.iloc[0]
            plays = df.iloc[1:].assign(
                relative_strength=first_row["homeWinProbability"],
                away_team_id=first_row["away_team_id"],
                home_team_id=first_row["home_team_id"],
                home_win=first_row["home_win"],
            )
            # NaN != NaN, so rows without a timestep would each create their
            # own one-row bucket; they cannot be grouped and are dropped.
            plays = plays[plays["timestep"].notna()]
            for _, row in plays.iterrows():
                data.setdefault(row["timestep"], []).append(row)
    return data

# data[timestep] is the list of rows load_data collected for that timestep.
data = load_data(interpolated_dir)

In [None]:
# Columns used for modelling. Order matters: downstream code takes the first
# entry ("home_win") as the label and the remaining entries as inputs.
features = ["home_win", "relative_strength", "homeScore", "awayScore", "scoringPlay", "start.down", "start.distance", "start.yardLine", "end.down", "end.distance", "end.yardLine"]

# Diagnostic: print every row that is missing at least one of the features.
for timestep, rows in data.items():
    # pd.isna is used instead of np.isnan because np.isnan raises TypeError on
    # None or non-numeric values, which object-dtype columns can contain.
    rows_with_nan = [row for row in rows if any(pd.isna(row[feature]) for feature in features)]
    if rows_with_nan:
        print("Timestep:", timestep)
        for row in rows_with_nan:
            print(row)

In [None]:
# Feature Selection
def feature_selection(data, features):
    """Project every row onto ``features`` and discard incomplete rows.

    Args:
        data: dict mapping a timestep to a list of rows; each row must support
            ``row[feature]`` lookups.
        features: ordered feature names to extract from each row.

    Returns:
        dict with the same timestep keys as ``data``. Each value is a list of
        float lists (one per kept row, ordered like ``features``). A row is
        dropped when any extracted value is NaN, so a timestep may map to an
        empty list.
    """
    selected = {}
    for timestep, rows in data.items():
        kept = []
        for row in rows:
            values = [float(row[name]) for name in features]
            # Keep only rows in which every selected feature is present.
            if not np.isnan(values).any():
                kept.append(values)
        selected[timestep] = kept
    return selected

# Per-timestep lists of float vectors ordered like `features`; rows containing NaN are left out.
features_data = feature_selection(data, features)

In [None]:
# Setup model for each timestep
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Dict (not an array) of fitted logistic regression models, keyed by timestep; filled in by setup_models.
models = {}
def setup_models(features_data, test_size=0.2, random_state=42, max_iter=1000):
    """Fit one logistic regression model per timestep and store it in ``models``.

    For each timestep the first column of its rows is used as the label and
    the remaining columns as inputs. The rows are split with
    ``train_test_split`` and the model is fitted on the training part only;
    the held-out part is not evaluated here.

    Timesteps that cannot be trained are reported and skipped instead of
    aborting the whole run: those with fewer than two usable rows (including
    the empty lists left when every row was filtered out), and those whose
    training labels contain a single class.

    Args:
        features_data: dict mapping a timestep to a list of equal-length
            numeric rows, label first.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.
        max_iter: forwarded to ``LogisticRegression``.

    Returns:
        The module-level ``models`` dict (timestep -> fitted model), which is
        also updated in place.
    """
    for timestep, rows in features_data.items():
        # Build the array once; an empty list yields a 1-D array that cannot
        # be sliced column-wise.
        samples = np.asarray(rows, dtype=float)
        if samples.ndim != 2 or samples.shape[0] < 2 or samples.shape[1] < 2:
            print("Skipping timestep", timestep, "- not enough usable rows")
            continue
        X, y = samples[:, 1:], samples[:, 0]
        X_train, _X_test, y_train, _y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # LogisticRegression.fit raises when it sees only one class.
        if np.unique(y_train).size < 2:
            print("Skipping timestep", timestep, "- training labels contain a single class")
            continue
        model = LogisticRegression(max_iter=max_iter)
        print("Training for timestep", timestep)
        model.fit(X_train, y_train)
        models[timestep] = model
    return models
# Fit a model for each timestep in features_data; setup_models stores each one in `models`.
setup_models(features_data)
# Bare expression so the notebook displays the timestep -> model dict.
models

[0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1.
 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.
 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0.
 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0.
 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Timestep: 0.0
home_team_id                   33.0
away_team_id                    4.0
home_win                        1.0
sequenceNumber                  100
homeWinProbability             0.81
homeScore                       NaN
scoringPlay                     NaN
priority                        NaN
statYardage                     NaN
awayScore                       NaN
wallclock                       NaN
modified                        NaN
id                              NaN
text                            NaN
period.number                   NaN
scoringType.displayName         NaN
scoringType.name                NaN
scoringType.abbreviation        NaN
start.distance                  NaN
start.yardLine                  NaN
start.team.id                   NaN
start.down                      NaN
start.yardsToEndzone            NaN
clock.displayValue              NaN
type.id                         NaN
type.text                       NaN
type.abbreviation               NaN
end.distance  