In [None]:
# import modules
import os
from dotenv import load_dotenv
import sqlite3
from sqlite3 import Error
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
import warnings

# Pull variables from a local .env file into the process environment.
load_dotenv()

# FILES_DIR is used as the base directory for the SQLite database that is read
# and the CSV that is written later in this notebook. Fail fast when it is
# missing: os.getenv would return None and every f-string path built from it
# would silently start with the literal text 'None/'.
ROOT_DIR = os.getenv('FILES_DIR')
if not ROOT_DIR:
    raise RuntimeError(
        "Environment variable 'FILES_DIR' is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )

In [None]:
# Notebook-wide settings: let printed frames use up to 1000 characters per
# line, and silence every warning raised from here on.
pd.options.display.width = 1000
warnings.simplefilter('ignore')

In [None]:
# get data
db_file = f'{ROOT_DIR}/ff_ml.db'

# sqlite3.connect() creates an empty database when the path does not exist,
# which would only surface later as a confusing "no such table" error.
if not os.path.isfile(db_file):
    raise FileNotFoundError(f'SQLite database not found: {db_file}')

conn = sqlite3.connect(db_file)
try:
    # Read each source table in full into its own raw frame.
    proj_df_raw = pd.read_sql('SELECT * FROM projections', conn)
    rank_df_raw = pd.read_sql('SELECT * FROM ranks', conn)
    adp_df_raw = pd.read_sql('SELECT * FROM adps', conn)
    stat_df_raw = pd.read_sql('SELECT * FROM stats', conn)
finally:
    # Release the database handle even when one of the queries fails.
    conn.close()

In [None]:
# filter out irrelevant rows of stats:
# keep only rows with at least 13 games played and at least 5 points per game
enough_games = stat_df_raw['g_played'] >= 13
enough_points = stat_df_raw['pts/g'] >= 5
stat_df_raw = stat_df_raw[enough_games & enough_points]

In [None]:
# format df's
# Columns removed from each source table before the tables are merged.
PROJ_COL_DROP = ['year', 'plyr', 'team']
RANK_COL_DROP = ['year', 'plyr', 'pos', 'team', 'rank', 'pos_rank', 'best', 'worst', 'avg', 'std_dev', 'ecr_adp']
ADP_COL_DROP = ['year', 'plyr', 'pos', 'team', 'pos_rank', 'avg']

# DataFrame.drop returns a new frame, so the *_raw frames are left untouched
# and this cell can be re-run safely.
proj_df = proj_df_raw.drop(columns=PROJ_COL_DROP)
rank_df = rank_df_raw.drop(columns=RANK_COL_DROP)
adp_df = adp_df_raw.drop(columns=ADP_COL_DROP)

# Only the join key and the regression target are needed from the stats table.
stat_df = stat_df_raw[['year-plyr-pos', 'pts/g']].copy()

# NOTE: 'pos_rank' and 'avg' are part of RANK_COL_DROP / ADP_COL_DROP, so the
# rename of those columns that used to follow here never matched anything and
# has been removed. To use them as features, take them out of the drop lists
# and rename them (e.g. to 'rank_pos_rank' / 'adp_pos_rank') to avoid clashes
# when the frames are merged.

In [None]:
# merge prediction-based dfs and fix dtypes
merge_key = 'year-plyr-pos-team'
pred_df = (
    proj_df
    .merge(rank_df, on=merge_key, how='inner')
    .merge(adp_df, on=merge_key, how='inner')
)
# The first merge leaves suffixed duplicates of 'year-plyr-pos'; discard them
# together with the merge key itself.
pred_df.drop(columns=[merge_key, 'year-plyr-pos_x', 'year-plyr-pos_y'], inplace=True)

# Cast every column that can be converted to float32; columns whose values
# cannot be converted raise ValueError and keep their current dtype.
for col_name in pred_df.columns.tolist():
    try:
        pred_df = pred_df.astype({col_name: 'float32'})
    except ValueError:
        pass

In [None]:
# merge to stat df to create 1 training df
# Inner join keeps only rows present in both the stats and the predictions;
# the join key then becomes the index.
df_train = (
    stat_df
    .merge(pred_df, on='year-plyr-pos', how='inner')
    .set_index('year-plyr-pos')
)
print(df_train.head(5))
print(df_train.shape)

In [None]:
# split training df to pos and X and y
def _split_xy(frame, position):
    """Return (features, target) for the rows of `frame` whose 'pos' equals `position`."""
    rows = frame[frame['pos'] == position]
    features = rows.drop(columns=['pos', 'pts/g'])
    target = rows['pts/g'].copy()
    return features, target


qb_X, qb_y = _split_xy(df_train, 'qb')
rb_X, rb_y = _split_xy(df_train, 'rb')
wr_X, wr_y = _split_xy(df_train, 'wr')
te_X, te_y = _split_xy(df_train, 'te')

In [None]:
# Show the (rows, columns) dimensions of the 'qb' feature frame.
qb_X.shape

In [None]:
# set up dataframe for error summary:
# one 'method' label column plus an MSE and an R^2 column per position
df = pd.DataFrame(
    columns=['method'] + [
        f'{position}_{metric}'
        for position in ('qb', 'rb', 'wr', 'te')
        for metric in ('msq', 'r2')
    ]
)

from sklearn import linear_model, kernel_ridge, ensemble, svm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# function to fit polynomial data
def fit_me(df, reg, name):
    """Score `reg` on every position for polynomial degrees 1, 2 and 3.

    For each degree a pipeline of StandardScaler -> PCA(n_components=0.99) ->
    PolynomialFeatures(degree) -> `reg` is fitted on each position's training
    split and evaluated on the matching test split.

    The splits are NOT passed in: they are read from the notebook-level
    variables ``<pos>_X_train``, ``<pos>_X_test``, ``<pos>_y_train`` and
    ``<pos>_y_test`` for pos in qb, rb, wr, te, which must be assigned before
    this function is called.

    Args:
        df: Summary frame to extend (columns 'method', '<pos>_msq', '<pos>_r2').
        reg: Estimator used as the final pipeline step. The same instance is
            refitted for every position and degree.
        name: Label for the estimator; each row's 'method' is
            ``name + '-' + str(degree)``.

    Returns:
        A new DataFrame holding the rows of `df` plus one row per degree that
        was fitted without a RuntimeError.
    """
    columns = ['method', 'qb_msq', 'qb_r2', 'rb_msq', 'rb_r2', 'wr_msq', 'wr_r2', 'te_msq', 'te_r2']
    for each in [1, 2, 3]:
        label = name + '-' + str(each)
        print(label)
        # Position order must match the order of the metric columns above.
        splits = [
            (qb_X_train, qb_X_test, qb_y_train, qb_y_test),
            (rb_X_train, rb_X_test, rb_y_train, rb_y_test),
            (wr_X_train, wr_X_test, wr_y_train, wr_y_test),
            (te_X_train, te_X_test, te_y_train, te_y_test),
        ]
        try:
            poly_reg = Pipeline([
                ('std_scalar', StandardScaler()),
                ('pca', PCA(n_components=0.99)),
                ('poly', PolynomialFeatures(degree=each, include_bias=False)),
                ('regressor', reg)
            ])
            row = [label]
            for X_train, X_test, y_train, y_test in splits:
                poly_reg.fit(X_train, y_train)
                y_pred = poly_reg.predict(X_test)
                row.append(mean_squared_error(y_test, y_pred))
                row.append(r2_score(y_test, y_pred))
        except RuntimeError as exc:
            # Best effort: a configuration that fails to fit is left out of
            # the summary, but say so instead of dropping it silently.
            print(f'  skipped {label}: {exc}')
            continue
        df = pd.concat([df, pd.DataFrame([row], columns=columns)])
    return df

def fit_me_other(df, reg, name):
    """Fit `reg` directly (no preprocessing pipeline) on each position and record its scores.

    The train/test splits are read from the notebook-level variables
    ``<pos>_X_train``, ``<pos>_X_test``, ``<pos>_y_train`` and ``<pos>_y_test``
    for pos in qb, rb, wr, te.

    Args:
        df: Summary frame to extend.
        reg: Estimator to fit; the same instance is refitted for every position.
        name: Value stored in the 'method' column of the new row.

    Returns:
        A new DataFrame: `df` followed by one row of MSE / R^2 per position.
    """
    print(name)
    # Position order matches the order of the metric columns below.
    position_splits = (
        (qb_X_train, qb_X_test, qb_y_train, qb_y_test),
        (rb_X_train, rb_X_test, rb_y_train, rb_y_test),
        (wr_X_train, wr_X_test, wr_y_train, wr_y_test),
        (te_X_train, te_X_test, te_y_train, te_y_test),
    )
    row = [name]
    for X_train, X_test, y_train, y_test in position_splits:
        reg.fit(X_train, y_train)
        predicted = reg.predict(X_test)
        row += [mean_squared_error(y_test, predicted), r2_score(y_test, predicted)]
    result_row = pd.DataFrame(
        [row],
        columns=['method', 'qb_msq', 'qb_r2', 'rb_msq', 'rb_r2', 'wr_msq', 'wr_r2', 'te_msq', 'te_r2'],
    )
    return pd.concat([df, result_row])

# multiple iterations with different data splits
N_SPLITS = 10                   # number of independent train/test splits
BASE_SEED = 42                  # split i uses seed BASE_SEED + i -> reproducible runs
ALPHAS = [0.25, 0.5, 0.75, 1]   # regularisation strengths tried for each penalised model

for idx in range(N_SPLITS):
    print('-'*20)
    print(idx)
    print('-'*20)

    # A different but fixed seed per iteration: every iteration still sees a
    # different split, yet Restart & Run All reproduces the same numbers.
    split_seed = BASE_SEED + idx
    qb_X_train, qb_X_test, qb_y_train, qb_y_test = train_test_split(qb_X, qb_y, test_size=0.3, random_state=split_seed)
    rb_X_train, rb_X_test, rb_y_train, rb_y_test = train_test_split(rb_X, rb_y, test_size=0.3, random_state=split_seed)
    wr_X_train, wr_X_test, wr_y_train, wr_y_test = train_test_split(wr_X, wr_y, test_size=0.3, random_state=split_seed)
    te_X_train, te_X_test, te_y_train, te_y_test = train_test_split(te_X, te_y, test_size=0.3, random_state=split_seed)

    # (method name, estimator) pairs evaluated through the polynomial pipeline.
    candidates = [
        ('LinearRegression', linear_model.LinearRegression()),
        ('LinearRegression | Positive', linear_model.LinearRegression(positive=True)),
    ]
    candidates += [(f'Ridge | Alpha={alpha}', linear_model.Ridge(alpha=alpha)) for alpha in ALPHAS]
    # The positive Ridge variant is configured with the 'lbfgs' solver.
    candidates += [
        (f'Ridge | Alpha={alpha} | Positive', linear_model.Ridge(alpha=alpha, solver='lbfgs', positive=True))
        for alpha in ALPHAS
    ]
    for label, estimator in (('Lasso', linear_model.Lasso), ('LARS Lasso', linear_model.LassoLars)):
        candidates += [(f'{label} | Alpha={alpha}', estimator(alpha=alpha)) for alpha in ALPHAS]
        candidates += [(f'{label} | Alpha={alpha} | Positive', estimator(alpha=alpha, positive=True)) for alpha in ALPHAS]
    candidates.append(('Bayesian Ridge Regression', linear_model.BayesianRidge()))
    candidates += [(f'Elastic Net | Alpha={alpha}', linear_model.ElasticNet(alpha=alpha)) for alpha in ALPHAS]
    candidates += [
        (f'Elastic Net | Alpha={alpha} | Positive', linear_model.ElasticNet(alpha=alpha, positive=True))
        for alpha in ALPHAS
    ]

    for method_name, reg in candidates:
        df = fit_me(df, reg, method_name)

    # Gradient boosting is scored on the raw features, without the pipeline.
    reg = ensemble.GradientBoostingRegressor(random_state=split_seed)
    df = fit_me_other(df, reg, 'Gradient Boosting Regressor')

# `df` is threaded through every iteration, so it already holds exactly one
# row per (method, split). The previous code appended this cumulative frame to
# `final_df` on every pass, which counted split 0 ten times, split 1 nine
# times, ... and skewed the per-method averages computed afterwards.
final_df = df

# clean and sort data for analysis
# Average each metric per method across all recorded runs.
df = final_df.groupby('method').mean()
# Optional filter (disabled): keep only methods whose R^2 is positive for
# every position, i.e. qb_r2, rb_r2, wr_r2 and te_r2 all > 0.
# Rank methods by the unweighted mean of the four per-position MSEs.
df = df.assign(avg_msq=(df['qb_msq'] + df['rb_msq'] + df['wr_msq'] + df['te_msq'])/4)
df = df.sort_values(by=['avg_msq'], ascending=True)

In [None]:
# Display the per-method summary, sorted by ascending avg_msq.
df

In [None]:
# Persist the summary table (index = method name) as analysis.csv under ROOT_DIR.
df.to_csv(f'{ROOT_DIR}/analysis.csv')