In [None]:
from ml_pipe.week_data_prep import generate_input_data
import pandas as pd
import numpy as np
import os

# Locations of the cached, pre-processed training frames and of the raw
# competition CSVs ('{}' in week_fp is filled with the week number).
week_df_fp = 'assets/week_train.csv'
week_Xy_fp = 'assets/train_xy.csv'
week_fp = 'nfl-big-data-bowl-2021/week{}.csv'
plays_fp = 'nfl-big-data-bowl-2021/plays.csv'

# Use the cache only when BOTH files are present. Checking just one of them
# would crash in read_csv if the other had been deleted or never written.
if os.path.exists(week_df_fp) and os.path.exists(week_Xy_fp):
    week_df_full = pd.read_csv(week_df_fp)
    X_y_full = pd.read_csv(week_Xy_fp)
else:
    # Collect the per-week frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies everything accumulated so far.
    week_dfs = []
    X_ys = []
    for week in range(1, 3):
        week_df, X_y = generate_input_data(week_fp.format(week), plays_fp)
        week_dfs.append(week_df)
        X_ys.append(X_y)

    # ignore_index=True gives the freshly built frames the same 0..n-1 index
    # that read_csv produces on the cached path, so both paths behave alike.
    week_df_full = pd.concat(week_dfs, axis=0, ignore_index=True)
    X_y_full = pd.concat(X_ys, axis=0, ignore_index=True)

    # Make sure the output directories exist before writing the cache files.
    for cache_fp in (week_df_fp, week_Xy_fp):
        cache_dir = os.path.dirname(cache_fp)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
    week_df_full.to_csv(week_df_fp, index=False)
    X_y_full.to_csv(week_Xy_fp, index=False)

In [15]:
# The two coordinate columns are the regression targets; every other column
# of the combined frame is used as a feature.
target_cols = ['x', 'y']
y = X_y_full[target_cols]
X = X_y_full.drop(columns=target_cols)

In [16]:
# Sanity check: (rows, feature columns) of the feature frame after dropping the targets.
X.shape

(889900, 71)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out a test split. random_state=0 fixes the shuffle so the split is
# reproducible; no test_size is passed, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric branch: replace missing values with the constant 0, then standardise.
impute = SimpleImputer(fill_value=0, strategy="constant")
scaler = StandardScaler()
num_pipe = Pipeline(steps=[('impute', impute), ('scaler', scaler)])

# Categorical branch: one-hot encode. handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
onehot = OneHotEncoder(handle_unknown='ignore')
cat_pipe = Pipeline(steps=[('one_hot', onehot)])

# Route columns by dtype: object columns are treated as categorical,
# everything else as numeric.
is_object_col = X_train.dtypes == 'object'
cat_cols = X_train.columns[is_object_col]
num_cols = X_train.columns[~is_object_col]

full_pipe = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, cat_cols),
        ('num', num_pipe, num_cols),
    ]
)

In [19]:
# Fit the preprocessing on the training split only and transform it in one step;
# the fitted transformer is reused later (transform only) for the test split.
X_train_tran = full_pipe.fit_transform(X_train)

In [61]:
from sklearn.linear_model import LinearRegression

# Train 10 linear models, each on its own random resample of the training
# rows, and keep every model together with the data it was fitted/validated on.
models = []
X_trains_rand = []
y_trains_rand = []
X_vals_rand = []
y_vals_rand = []

n_rows = X_train_tran.shape[0]
all_idx = range(n_rows)
n_sampled = int(n_rows * 0.8)

for i in range(10):
    # Seed with the loop counter so resample i is reproducible.
    np.random.seed(i)

    # Draw 80% of the row count; np.random.choice samples with replacement
    # by default, so a row position can appear more than once.
    idx_rand = np.random.choice(all_idx, size=n_sampled)
    # Row positions that were never drawn form this model's validation set.
    idx_val = np.delete(all_idx, idx_rand)

    X_train_rand = X_train_tran[idx_rand]
    y_train_rand = y_train.iloc[idx_rand]
    X_val_rand = X_train_tran[idx_val]
    y_val_rand = y_train.iloc[idx_val]

    lin_reg = LinearRegression().fit(X_train_rand, y_train_rand)

    models.append(lin_reg)
    X_trains_rand.append(X_train_rand)
    y_trains_rand.append(y_train_rand)
    X_vals_rand.append(X_val_rand)
    y_vals_rand.append(y_val_rand)

In [69]:
from sklearn.metrics import r2_score

# Predict with each model on its own validation rows, then compute one
# R^2 score per model against the matching validation targets.
y_preds_val = [
    fitted.predict(held_out_X)
    for fitted, held_out_X in zip(models, X_vals_rand)
]
r2_scores_val = [
    r2_score(truth, predicted)
    for truth, predicted in zip(y_vals_rand, y_preds_val)
]

In [70]:
# Apply the already-fitted preprocessing to the held-out test split
# (transform only, so nothing is re-fitted on test data).
X_test_tran = full_pipe.transform(X_test)

In [71]:
# Every model predicts on the same transformed test split; each prediction
# is then scored with R^2 against the shared test targets.
y_preds_test = [fitted.predict(X_test_tran) for fitted in models]
r2_scores_test = [
    r2_score(y_test, predicted)
    for predicted in y_preds_test
]

In [119]:
# Persist the column labels of the combined training frame (the 'x'/'y'
# target columns included) so they can be passed by file path below.
# NOTE(review): if the labels are plain strings this is presumably an
# object-dtype array, which np.save pickles — confirm the reader loads it
# with allow_pickle=True.
chosen_cols = np.array(X_y_full.columns)
np.save('assets/chosen_cols.npy', chosen_cols)

# Build the week 4 frames, handing generate_input_data the saved column list.
# NOTE(review): presumably chosen_col_fp restricts/aligns the output columns
# to the saved list — confirm in ml_pipe.week_data_prep.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# week4 / X_y4 may not have been assigned in that session.
week4, X_y4 = generate_input_data(week_fp.format(4), plays_fp, chosen_col_fp='assets/chosen_cols.npy')

KeyboardInterrupt: 

In [118]:
 # Display the saved column labels for inspection.
 chosen_cols

array(['x', 'y', 'position_num', 'x_starting_behind_line', 'y_starting',
       'yardsToGo', 'yards_needed_touch', 'time_acc_s', 'yardline_first',
       'x_starting_behind_line_CB1', 'x_starting_behind_line_CB2',
       'x_starting_behind_line_CB3', 'x_starting_behind_line_CB4',
       'x_starting_behind_line_DB1', 'x_starting_behind_line_DE1',
       'x_starting_behind_line_DL1', 'x_starting_behind_line_FS1',
       'x_starting_behind_line_FS2', 'x_starting_behind_line_FS3',
       'x_starting_behind_line_IL1', 'x_starting_behind_line_IL3',
       'x_starting_behind_line_IL4', 'x_starting_behind_line_LB1',
       'x_starting_behind_line_LB2', 'x_starting_behind_line_ML1',
       'x_starting_behind_line_ML3', 'x_starting_behind_line_NT1',
       'x_starting_behind_line_OL1', 'x_starting_behind_line_OL3',
       'x_starting_behind_line_OL4', 'x_starting_behind_line_OL6',
       'x_starting_behind_line_OL7', 'x_starting_behind_line_S0',
       'x_starting_behind_line_SS1', 'x_starting_b