In [None]:
# import modules
import os
from dotenv import load_dotenv
import sqlite3
from sqlite3 import Error
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Load variables from a local .env file (if present) into the environment.
load_dotenv()
# Directory holding the project's data files, taken from FILES_DIR.
ROOT_DIR = os.getenv('FILES_DIR')
if not ROOT_DIR:
    # Fail here with a clear message instead of later building paths such as
    # 'None/ff_ml.db' from a missing setting.
    raise RuntimeError(
        "FILES_DIR is not set; define it in the environment or in a .env file "
        "so the notebook can locate its data files."
    )

In [None]:
# Let printed frames use up to 1000 characters per line.
pd.options.display.width = 1000

In [None]:
# constants
# League and draft settings shared by the cells below.
TEAMS = 12
ROSTER_SIZE = 16
# Starter slots per team, defense and kicker excluded (per the name).
# NOTE(review): the individual terms presumably map to positions — confirm.
STARTERS_NO_DEF_K = sum((1, 2, 2, 1, 1))
# The counted starters plus one more player for every team.
VBD_BASELINE = TEAMS * (STARTERS_NO_DEF_K + 1)
BUDGET = 200
# League-wide money after taking ROSTER_SIZE off each team's budget.
TOTAL_MONEY = TEAMS * (BUDGET - ROSTER_SIZE)
# Column names that the rank-inversion step looks for.
RANK_TYPE_COLUMNS = [
    'rank',
    'rank_pos_rank',
    'best',
    'worst',
    'rank_avg',
    'adp_pos_rank',
    'adp_avg',
]

In [None]:
# get data
# Read the four tables used to build the training set from the SQLite file.
db_file = f'{ROOT_DIR}/ff_ml.db'
conn = sqlite3.connect(db_file)
try:
    proj_df_raw = pd.read_sql('SELECT * FROM projections', conn)
    rank_df_raw = pd.read_sql('SELECT * FROM ranks', conn)
    adp_df_raw = pd.read_sql('SELECT * FROM adps', conn)
    stat_df_raw = pd.read_sql('SELECT * FROM stats', conn)
finally:
    # Release the connection even when one of the reads fails.
    conn.close()

In [None]:
# filter out irrelevant rows of stats
# Keep rows with at least 13 games played and at least 5 points per game.
played_enough = stat_df_raw['g_played'] >= 13
scored_enough = stat_df_raw['pts/g'] >= 5
stat_df_raw = stat_df_raw[played_enough & scored_enough]

In [None]:
# format df's
# Columns removed from each source table before the tables are merged.
PROJ_COL_DROP = ['year', 'plyr', 'team']
RANK_COL_DROP = ['year', 'plyr', 'pos', 'team', 'rank', 'pos_rank', 'best', 'worst', 'avg', 'std_dev', 'ecr_adp']
ADP_COL_DROP = ['year', 'plyr', 'pos', 'team', 'pos_rank', 'avg']

# Derive trimmed frames; the *_raw frames themselves are left untouched.
proj_df = proj_df_raw.drop(columns=PROJ_COL_DROP)
# NOTE(review): 'pos_rank' and 'avg' are in the drop lists above, so by the
# time rename runs those columns are gone and the renames appear to match
# nothing — confirm whether they were meant to be kept.
rank_df = (
    rank_df_raw
    .drop(columns=RANK_COL_DROP)
    .rename(columns={'pos_rank': 'rank_pos_rank', 'avg': 'rank_avg'})
)
adp_df = (
    adp_df_raw
    .drop(columns=ADP_COL_DROP)
    .rename(columns={'pos_rank': 'adp_pos_rank', 'avg': 'adp_avg'})
)
# Only the join key and the target are taken from the stats table.
stat_df = stat_df_raw.loc[:, ['year-plyr-pos', 'pts/g']].copy()

In [None]:
# merge prediction-based dfs and fix dtypes
# Inner-join projections, ranks and ADPs on the player/team key, then drop the
# join key and the suffixed duplicate keys the first merge produced.
pred_df = (
    proj_df
    .merge(rank_df, on='year-plyr-pos-team', how='inner')
    .merge(adp_df, on='year-plyr-pos-team', how='inner')
    .drop(columns=['year-plyr-pos-team', 'year-plyr-pos_x', 'year-plyr-pos_y'])
)
# Cast every column that can be read as a number; columns whose cast raises
# ValueError (e.g. text) are left as they are.
# float64 is used so the training features have the same dtype as the
# current-season features (df_test), which are cast to float64.
for column in list(pred_df.columns):
    try:
        pred_df = pred_df.astype({column: 'float64'})
    except ValueError:
        continue

In [None]:
# merge to stat df to create 1 training df
# Inner-join the 'pts/g' target onto the merged features and index the result
# by 'year-plyr-pos'.
df_train = (
    stat_df
    .merge(pred_df, how='inner', on='year-plyr-pos')
    .set_index('year-plyr-pos')
)

# NOTE(review): this rank inversion is disabled for the training frame, while
# the cell that builds df_test applies it — confirm which is intended.
# for column in list(df_train.columns):
#     if column in RANK_TYPE_COLUMNS:
#         df_train[column] = (TEAMS * ROSTER_SIZE) - df_train[column]

print(df_train.head(5))
print(df_train.shape)

In [None]:
# split training df to pos and X and y
def _training_split(position):
    """Return (X, y) for one position from df_train.

    X is every column except 'pos' and 'pts/g'; y is the 'pts/g' column.
    """
    rows = df_train.loc[df_train['pos'] == position]
    return rows.drop(columns=['pos', 'pts/g']), rows['pts/g'].copy()


qb_X_train, qb_y_train = _training_split('qb')
rb_X_train, rb_y_train = _training_split('rb')
wr_X_train, wr_y_train = _training_split('wr')
te_X_train, te_y_train = _training_split('te')

In [None]:
# same thing for test data

# get data
# Read the current-season tables from the same SQLite file.
db_file = f'{ROOT_DIR}/ff_ml.db'
conn = sqlite3.connect(db_file)
try:
    proj_df_current = pd.read_sql('SELECT * FROM projections_current', conn)
    rank_df_current = pd.read_sql('SELECT * FROM ranks_current', conn)
    adp_df_current = pd.read_sql('SELECT * FROM adps_current', conn)
finally:
    # Release the connection even when one of the reads fails.
    conn.close()

In [None]:
# get references for later
# Rank and ADP rows joined per player; columns shared by both tables get the
# _x (rank) and _y (ADP) suffixes.
player_ref = rank_df_current.merge(adp_df_current, how='inner', on='year-plyr-pos')

# Players whose ADP 'avg' is within TEAMS * ROSTER_SIZE.
within_roster_spots = adp_df_current['avg'] <= TEAMS * ROSTER_SIZE
vbd_df = adp_df_current[within_roster_spots]
# Number of those players at each position.
vbd_qb_baseline = vbd_df['pos'].eq('qb').sum()
vbd_rb_baseline = vbd_df['pos'].eq('rb').sum()
vbd_wr_baseline = vbd_df['pos'].eq('wr').sum()
vbd_te_baseline = vbd_df['pos'].eq('te').sum()

# Rank cut-offs per position, derived from the number of teams.
# They are later used as 1-based row positions in the ranked prediction
# frames, so every value is kept a whole number (integer arithmetic); a
# fractional cut-off would not match any row label.
vbd_qb_starter = TEAMS
vbd_rb_starter = TEAMS * 2
vbd_wr_starter = TEAMS * 2
vbd_te_starter = TEAMS

# 1.5x the starter cut-off, rounded down.
vbd_qb_top_reserve = vbd_qb_starter * 3 // 2
vbd_rb_top_reserve = vbd_rb_starter * 3 // 2
vbd_wr_top_reserve = vbd_wr_starter * 3 // 2
vbd_te_top_reserve = vbd_te_starter * 3 // 2

# Half the starter cut-off, rounded down.
vbd_qb_elite = vbd_qb_starter // 2
vbd_rb_elite = vbd_rb_starter // 2
vbd_wr_elite = vbd_wr_starter // 2
vbd_te_elite = vbd_te_starter // 2


In [None]:
# format df's
# Build trimmed frames without modifying the loaded *_current frames, so this
# cell can be re-run and other cells can still read the original columns.
proj_features = proj_df_current.drop(columns=PROJ_COL_DROP)
rank_features = (
    rank_df_current
    .drop(columns=RANK_COL_DROP)
    .rename(columns={'pos_rank': 'rank_pos_rank', 'avg': 'rank_avg'})
)
adp_features = (
    adp_df_current
    .drop(columns=ADP_COL_DROP)
    .rename(columns={'pos_rank': 'adp_pos_rank', 'avg': 'adp_avg'})
)

# merge prediction-based dfs and fix dtypes
df_test = (
    proj_features
    .merge(rank_features, on='year-plyr-pos-team', how='inner')
    .merge(adp_features, on='year-plyr-pos-team', how='inner')
    .drop(columns=['year-plyr-pos-team', 'year-plyr-pos_x', 'year-plyr-pos_y'])
)
# Cast every column that can be read as a number; columns whose cast raises
# ValueError (e.g. text) are left as they are.
for column in list(df_test.columns):
    try:
        df_test = df_test.astype({column: 'float64'})
    except ValueError:
        continue

df_test = df_test.set_index('year-plyr-pos')

# Replace each rank-type column that is present with
# (TEAMS * ROSTER_SIZE) - value.
# NOTE(review): the same step is commented out for df_train — confirm that
# train and test features are meant to differ here.
for column in list(df_test.columns):
    if column in RANK_TYPE_COLUMNS:
        df_test[column] = (TEAMS * ROSTER_SIZE) - df_test[column]

print(df_test.head(5))
print(df_test.shape)

In [None]:
# split test df to pos and X and y
# Only feature frames are built here: one per position, 'pos' column removed.
qb_X_test, rb_X_test, wr_X_test, te_X_test = (
    df_test.loc[df_test['pos'] == position].drop(columns=['pos'])
    for position in ('qb', 'rb', 'wr', 'te')
)

In [None]:
# prediction model
# model determined by model_analysis

def regressor(X_train, y_train, X_test, model, degree, reference):
    """Fit a polynomial-feature pipeline and predict points for ``X_test``.

    The pipeline expands the inputs with interaction-only polynomial terms,
    min-max scales them to [0, 1], projects them with PCA and then fits
    ``model`` on the result.

    Args:
        X_train: Training feature frame.
        y_train: Training target values.
        X_test: Feature frame to predict for; its index supplies 'name'.
        model: Estimator used as the final pipeline step.
        degree: Degree passed to ``PolynomialFeatures``.
        reference: Not used; kept so existing six-argument calls still work.

    Returns:
        DataFrame with columns 'name' (the index of ``X_test``) and 'pts'
        (the predictions), in the row order of ``X_test``.
    """
    poly_reg = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, interaction_only=True)),
        ('minmax', MinMaxScaler(feature_range=(0, 1))),
        ('pca', PCA()),
        ('regressor', model),
    ])
    poly_reg.fit(X_train, y_train)
    predicted_pts = poly_reg.predict(X_test)
    return pd.DataFrame({'name': X_test.index, 'pts': predicted_pts})

# Estimator -> polynomial degree, one mapping per position.
qb_models = {
    linear_model.ElasticNet(alpha=0.25): 2,
}
rb_models = {
    linear_model.ElasticNet(alpha=0.25): 2,
}
wr_models = {
    linear_model.ElasticNet(alpha=0.25): 2,
}
te_models = {
    linear_model.ElasticNet(alpha=0.25): 2,
}


def _predict_position(models, X_train, y_train, X_test, reference):
    """Predict with every model for one position and rank the players.

    Each model's predictions come from ``regressor``; predictions are averaged
    per 'name', sorted by 'pts' in descending order and re-indexed from 0.
    """
    per_model = [
        regressor(X_train, y_train, X_test, model, degree, reference)
        for model, degree in models.items()
    ]
    return (
        pd.concat(per_model)
        .groupby('name', as_index=False).mean()
        .sort_values(by=['pts'], ascending=False)
        .reset_index(drop=True)
    )


qb_pred_df = _predict_position(qb_models, qb_X_train, qb_y_train, qb_X_test, 'qb')
rb_pred_df = _predict_position(rb_models, rb_X_train, rb_y_train, rb_X_test, 'rb')
wr_pred_df = _predict_position(wr_models, wr_X_train, wr_y_train, wr_X_test, 'wr')
te_pred_df = _predict_position(te_models, te_X_train, te_y_train, te_X_test, 'te')

In [None]:
# set vbd
# The four ranked prediction frames, in the order used by every tuple below.
_ranked_frames = (qb_pred_df, rb_pred_df, wr_pred_df, te_pred_df)


def _pts_at_cutoffs(cutoff_ranks):
    """Return, for each ranked frame, the 'pts' value at row label rank - 1."""
    return [
        frame.loc[rank - 1, 'pts']
        for frame, rank in zip(_ranked_frames, cutoff_ranks)
    ]


qb_baseline, rb_baseline, wr_baseline, te_baseline = _pts_at_cutoffs(
    (vbd_qb_baseline, vbd_rb_baseline, vbd_wr_baseline, vbd_te_baseline)
)
qb_top_reserve, rb_top_reserve, wr_top_reserve, te_top_reserve = _pts_at_cutoffs(
    (vbd_qb_top_reserve, vbd_rb_top_reserve, vbd_wr_top_reserve, vbd_te_top_reserve)
)
qb_starter, rb_starter, wr_starter, te_starter = _pts_at_cutoffs(
    (vbd_qb_starter, vbd_rb_starter, vbd_wr_starter, vbd_te_starter)
)
qb_elite, rb_elite, wr_elite, te_elite = _pts_at_cutoffs(
    (vbd_qb_elite, vbd_rb_elite, vbd_wr_elite, vbd_te_elite)
)

# One column per cut-off: predicted points minus the points at that cut-off.
_vbd_columns = ('vbd_baseline', 'vbd_top_reserve', 'vbd_starter', 'vbd_elite')
_reference_points = (
    (qb_baseline, qb_top_reserve, qb_starter, qb_elite),
    (rb_baseline, rb_top_reserve, rb_starter, rb_elite),
    (wr_baseline, wr_top_reserve, wr_starter, wr_elite),
    (te_baseline, te_top_reserve, te_starter, te_elite),
)
for frame, references in zip(_ranked_frames, _reference_points):
    for column, reference_pts in zip(_vbd_columns, references):
        frame[column] = frame['pts'] - reference_pts

In [None]:
# Stack the per-position predictions and attach player details from
# player_ref. This rebinds pred_df, which earlier held the training features.
_reference_columns = ['year-plyr-pos', 'plyr_x', 'pos_x', 'team_x', 'avg_x', 'avg_y']
_output_columns = [
    'plyr', 'pos', 'team', 'rank', 'adp', 'pts',
    'vbd_baseline', 'vbd_top_reserve', 'vbd_starter', 'vbd_elite',
]
pred_df = (
    pd.concat([qb_pred_df, rb_pred_df, wr_pred_df, te_pred_df])
    .merge(player_ref[_reference_columns], left_on='name', right_on='year-plyr-pos')
    .drop(columns=['name', 'year-plyr-pos'])
    # In player_ref the _x columns come from the rank table and the _y columns
    # from the ADP table.
    .rename(columns={
        'plyr_x': 'plyr',
        'pos_x': 'pos',
        'team_x': 'team',
        'avg_x': 'rank',
        'avg_y': 'adp',
    })
)
pred_df = pred_df[_output_columns]

In [None]:
# vbd_sum = sum of the four VBD measures, counting negative values as 0.
# Computed column-wise rather than row by row; this also avoids writing float
# values into a column first created as integers.
# skipna=False keeps a missing VBD value as NaN in the sum instead of
# silently ignoring it.
vbd_columns = ['vbd_baseline', 'vbd_top_reserve', 'vbd_starter', 'vbd_elite']
pred_df['vbd_sum'] = pred_df[vbd_columns].clip(lower=0).sum(axis=1, skipna=False)

In [None]:
# Price each player: TOTAL_MONEY is split in proportion to vbd_sum.
has_vbd = pred_df['vbd_sum'] >= 0
total_pos_vbd = pred_df.loc[has_vbd, 'vbd_sum'].sum()
cost_per_vbd = TOTAL_MONEY / total_pos_vbd
pred_df['auction_value'] = cost_per_vbd * pred_df['vbd_sum']
pred_df

In [None]:
# pred_df.to_csv(f'{ROOT_DIR}/test.csv')

In [None]:
def regressor(X_train, y_train, X_test, model, degree):
    """Fit the prediction pipeline and report one coefficient per feature.

    The pipeline is: interaction-only polynomial features -> min-max scaling
    to [0, 1] -> PCA -> ``model``. The model is fitted on PCA components, so
    its ``coef_`` holds one weight per component. Those weights are projected
    back through the PCA loadings (``components_.T @ coef_``) so that each
    returned value belongs to the polynomial feature named in the index.
    The values apply to the min-max-scaled features.

    NOTE(review): this definition reuses the name of the six-argument
    ``regressor`` defined earlier in the notebook and replaces it once this
    cell has run — consider renaming one of the two.

    Args:
        X_train: Training feature frame.
        y_train: Training target values.
        X_test: Not used; kept so existing calls still work.
        model: Estimator used as the final pipeline step; must expose a 1-D
            ``coef_`` after fitting.
        degree: Degree passed to ``PolynomialFeatures``.

    Returns:
        DataFrame indexed by polynomial feature name with a single column
        ``0`` holding the coefficient for that feature.
    """
    poly_reg = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, interaction_only=True)),
        ('minmax', MinMaxScaler(feature_range=(0, 1))),
        ('pca', PCA()),
        ('regressor', model),
    ])
    poly_reg.fit(X_train, y_train)

    steps = poly_reg.named_steps
    feature_names = steps['poly'].get_feature_names_out()
    # components_ has shape (n_components, n_features); its transpose maps the
    # per-component weights back onto the features.
    feature_coefficients = steps['pca'].components_.T @ steps['regressor'].coef_
    return pd.DataFrame(feature_coefficients, index=feature_names)

# Estimator -> polynomial degree to analyse.
models = {
    linear_model.ElasticNet(alpha=0.25): 2,
}

# Pool the per-position frames (QB, RB, WR, TE order) into single frames.
_train_feature_parts = [qb_X_train, rb_X_train, wr_X_train, te_X_train]
_train_target_parts = [qb_y_train, rb_y_train, wr_y_train, te_y_train]
_test_feature_parts = [qb_X_test, rb_X_test, wr_X_test, te_X_test]
analysis_X_train = pd.concat(_train_feature_parts)
analysis_y_train = pd.concat(_train_target_parts)
analysis_X_test = pd.concat(_test_feature_parts)

# Each pass overwrites temp_df, so it ends up holding the last model's result.
for idx, (model, degree) in enumerate(models.items()):
    temp_df = regressor(analysis_X_train, analysis_y_train, analysis_X_test, model, degree)

# temp_df.to_csv(f'{ROOT_DIR}/analysis_of_current_model.csv')

In [None]:
# Display the pooled training features used for the coefficient analysis.
analysis_X_train