In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Load Data
# The CSV is read with header=None, so pandas labels the columns by integer
# position; readable names are attached immediately afterwards.
# NOTE(review): this is an absolute path on one machine - consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv('/Users/sauce/Desktop/DraftKings/ready_data/DraftKingsCleaned.csv', header = None)

# rename columns
# Position i in this list is the name given to positional column i.
column_names = [
    'season', 'game_date', 'player', 'team', 'opponent', 'venue',
    'minutes', 'usage_rate', 'rest',
    'avg_threes', 'avg_reb', 'avg_ast', 'avg_stl', 'avg_blk', 'avg_tov',
    'avg_pts', 'avg_points_vs_opp',
    'team_pace', 'team_ast', 'team_tov', 'team_reb_rate',
    'team_offeff', 'team_defeff',
    'opp_pace', 'opp_ast', 'opp_tov', 'opp_reb_rate',
    'opp_offeff', 'opp_defeff', 'opp_pos_avg',
    'salary', 'fantasy_points',
]
data = data.rename(columns=dict(enumerate(column_names)))

# Avg 10 data
# Compute a 10-game rolling mean per player and attach the results to the game
# log; overlapping column names from the rolling frame get an `_AVG10` suffix.
# NOTE(review): rolling(10) includes the current row, so `fantasy_points_AVG10`
# contains the very game whose fantasy_points is later used as the target -
# consider shifting by one game per player to avoid leakage.
data["game_date"] = pd.to_datetime(data.game_date)
# Index by date and sort so each player's window follows date order.
data = data.set_index('game_date').sort_index()
# The rolling result is indexed by (player, game_date). Its own `season` and
# `player` value columns are parked under temporary names so reset_index() can
# emit the index levels without a name clash.
# NOTE(review): this relies on the rolling output containing a `player` value
# column; newer pandas releases may exclude the grouping column - confirm
# against the pandas version in use.
df_rolling = data.groupby(['player']).rolling(10).mean().rename(columns={'season':'season1', 'player':'player1'}).reset_index()
data = data.reset_index()
# Discard the index-derived `player` and restore the parked columns as keys.
df_rolling = df_rolling.drop(columns=['player'])
df_rolling = df_rolling.rename(columns = {'season1': 'season', 'player1': 'player'})
# Join on the key columns only. The original also passed left_index=True, which
# current pandas rejects with a MergeError when combined with `on`.
# NOTE(review): the right-hand `season`/`player` keys are rolling means, so rows
# with an incomplete window (NaN) or a window spanning two seasons will not
# match and are dropped by this inner join - confirm that is intended.
data = pd.merge(data, df_rolling, on=['player', 'season', 'game_date'], suffixes=('', '_AVG10'))

# Avg 3 data
# Compute a 3-game rolling mean per player and attach the results to the game
# log as `df`; overlapping column names from the rolling frame get an `_AVG3`
# suffix.
# NOTE(review): rolling(3) includes the current row, so `fantasy_points_AVG3`
# contains the very game whose fantasy_points is later used as the target -
# consider shifting by one game per player to avoid leakage.
data["game_date"] = pd.to_datetime(data.game_date)
# Index by date and sort so each player's window follows date order.
data = data.set_index('game_date').sort_index()
# The rolling result is indexed by (player, game_date). Its own `season` and
# `player` value columns are parked under temporary names so reset_index() can
# emit the index levels without a name clash.
# NOTE(review): this relies on the rolling output containing a `player` value
# column; newer pandas releases may exclude the grouping column - confirm
# against the pandas version in use.
df_rolling = data.groupby(['player']).rolling(3).mean().rename(columns={'season':'season1', 'player':'player1'}).reset_index()
data = data.reset_index()
# Discard the index-derived `player` and restore the parked columns as keys.
df_rolling = df_rolling.drop(columns=['player'])
df_rolling = df_rolling.rename(columns = {'season1': 'season', 'player1': 'player'})
# Join on the key columns only. The original also passed left_index=True, which
# current pandas rejects with a MergeError when combined with `on`.
# NOTE(review): the right-hand `season`/`player` keys are rolling means, so rows
# with an incomplete window (NaN) or a window spanning two seasons will not
# match and are dropped by this inner join - confirm that is intended.
df = pd.merge(data, df_rolling, on=['player', 'season', 'game_date'], suffixes=('', '_AVG3'))

# Drop useless features
# The column labels to remove are assembled from base feature names plus the
# suffix appended by each merge, instead of spelling out every label.

# Features whose rolling mean is discarded for both `_AVG10` and `_AVG3`.
unused_in_both_windows = [
    'team', 'opponent', 'venue', 'rest',
    'avg_threes', 'avg_reb', 'avg_ast', 'avg_stl',
    'avg_blk', 'avg_tov', 'avg_pts', 'avg_points_vs_opp',
    'team_pace', 'team_ast', 'team_tov', 'team_reb_rate',
    'team_defeff', 'opp_pace', 'opp_ast', 'opp_tov',
    'opp_reb_rate', 'opp_offeff', 'salary',
]
# Discarded for `_AVG3` only; `opp_pos_avg_AVG10` is kept.
unused_in_avg3_only = ['opp_pos_avg']
# Every `_AVG10_AVG3` column is discarded.
all_rolled_features = [
    'team', 'opponent', 'venue', 'minutes', 'usage_rate', 'rest',
    'avg_threes', 'avg_reb', 'avg_ast', 'avg_stl', 'avg_blk',
    'avg_tov', 'avg_pts', 'avg_points_vs_opp',
    'team_pace', 'team_ast', 'team_tov', 'team_reb_rate',
    'team_offeff', 'team_defeff',
    'opp_pace', 'opp_ast', 'opp_tov', 'opp_reb_rate',
    'opp_offeff', 'opp_defeff', 'opp_pos_avg',
    'salary', 'fantasy_points',
]

columns_to_drop = (
    [name + '_AVG10' for name in unused_in_both_windows]
    + [name + '_AVG3' for name in unused_in_both_windows + unused_in_avg3_only]
    + [name + '_AVG10_AVG3' for name in all_rolled_features]
)
df = df.drop(columns=columns_to_drop)

# Fill NAs
# Each entry is (column to fill, grouping keys, source column). Missing values
# in the rolling-average column are replaced with the mean of the source
# column within the same group.
# NOTE(review): the group mean is taken over every row of the group, including
# the row being filled and later games - confirm this is acceptable for a
# feature used to predict fantasy_points.
fill_plan = [
    ('fantasy_points_AVG3',  ['player', 'season'],   'fantasy_points'),
    ('opp_defeff_AVG3',      ['opponent', 'season'], 'opp_defeff'),
    ('team_offeff_AVG3',     ['team', 'season'],     'team_offeff'),
    ('usage_rate_AVG3',      ['player', 'season'],   'usage_rate'),
    ('fantasy_points_AVG10', ['player', 'season'],   'fantasy_points'),
    ('minutes_AVG3',         ['player', 'season'],   'minutes'),
    ('opp_pos_avg_AVG10',    ['opponent', 'season'], 'opp_pos_avg'),
    ('opp_defeff_AVG10',     ['opponent', 'season'], 'opp_defeff'),
    ('team_offeff_AVG10',    ['team', 'season'],     'team_offeff'),
    ('usage_rate_AVG10',     ['player', 'season'],   'usage_rate'),
    ('minutes_AVG10',        ['player', 'season'],   'minutes'),
]
for target_col, group_keys, source_col in fill_plan:
    group_mean = df.groupby(group_keys)[source_col].transform('mean')
    df[target_col] = df[target_col].fillna(group_mean)

# Drop non predictive columns
# These columns are excluded from the model inputs.
non_predictive = ['game_date', 'season', 'player', 'team', 'opponent',
                  'minutes', 'usage_rate', 'salary']
# Encode Dummies
# `venue` and `rest` are one-hot encoded; drop_first=True omits the first
# level of each.
categorical = ['venue', 'rest']
df2 = pd.get_dummies(df.drop(columns=non_predictive), columns=categorical, drop_first=True)

# Grab Target Variable and remove it from data.
target_name = 'fantasy_points'
X = df2.drop(columns=[target_name])
y = df2[target_name]

# Split data into train and test sets
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# `train_test_split` is imported from sklearn.model_selection: the former
# sklearn.cross_validation module was removed from scikit-learn, and this
# notebook already imports cross_val_score from model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Scale data
# The scaler is fitted on the training rows only, then the same fitted
# transform is applied to both the training and the test rows.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the scaled training data and score it on the
# held-out test set.
forest_params = dict(
    n_estimators=500,
    min_samples_leaf=3,
    min_samples_split=2,
    random_state=0,   # fixed seed for a repeatable forest
    n_jobs=-1,        # use all available cores
)
regressor = RandomForestRegressor(**forest_params)
regressor.fit(X_train, y_train)
pred = regressor.predict(X_test)

# Report each metric on its own line, computed against the test targets.
for label, score_fn in (('Mean Squared Error:', metrics.mean_squared_error),
                        ('R2 Score:', metrics.r2_score)):
    print(label, score_fn(y_test, pred))

Mean Squared Error: 36.5373911589859
R2 Score: 0.8141272495091534


In [5]:
# Display the array of test-set predictions returned by regressor.predict.
pred

array([23.7358033 ,  3.98953931, 29.45736418, ..., 24.62907309,
        5.04640317, 28.66897398])