### Import Packages

In [1]:
import timeit
import joblib
import random
import numpy as np
import pandas as pd
from datetime import datetime
from lib.helpers import min_games_filter, ohe_players, clean_nulls, prepare_dataset, get_feature_importance

In [2]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

### Import Data

In [3]:
# Import data
df = pd.read_excel("Cleaned_NBA1819_PlayerStats-DFS.xlsx")
df = df.drop("Unnamed: 0", axis=1)

In [4]:
df = ohe_players(df)
df.shape

(27855, 567)

In [5]:
boxscore_stats = ['MINUTES', 'USAGE RATE', 'MIN', 'FG', 'FGA',
                   '3P', '3PA', 'FT', 'FTA', 'OR', 'DR', 'TOT', 
                   'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'YAHOO_FANTASYPOINTS']

boxscore_stats

['MINUTES',
 'USAGE RATE',
 'MIN',
 'FG',
 'FGA',
 '3P',
 '3PA',
 'FT',
 'FTA',
 'OR',
 'DR',
 'TOT',
 'A',
 'PF',
 'ST',
 'TO',
 'BL',
 'PTS',
 'YAHOO_FANTASYPOINTS']

### Data Cleaning

In [6]:
cleannull_df = clean_nulls(df, drop_date=True)
finalprep_df = prepare_dataset(cleannull_df, dfs_type="YH")

In [7]:
X = finalprep_df.drop(columns=boxscore_stats, axis=1)
y = np.asarray([list(x) for x in zip(finalprep_df['MINUTES'], finalprep_df['USAGE RATE'], finalprep_df['MIN'], finalprep_df['FG'], finalprep_df['FGA'],
                                     finalprep_df['3P'], finalprep_df['3PA'], finalprep_df['FT'], finalprep_df['FTA'], finalprep_df['OR'], finalprep_df['DR'], finalprep_df['TOT'],
                                     finalprep_df['A'], finalprep_df['PF'], finalprep_df['ST'], finalprep_df['TO'], finalprep_df['BL'], finalprep_df['PTS'], finalprep_df['YAHOO_FANTASYPOINTS'])])

In [None]:
# # Keep the dependent feature names
# X_colnames = finalprep_df.drop(['YAHOO_FANTASYPOINTS'], axis=1).columns
# X_colnames

In [16]:
#len(X_colnames)

### Train/Test Split

In [17]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit standardizer to training set
standardscaler = StandardScaler()
standardscaler.fit(X_train)

# Transform the feature variables
X_train_std = standardscaler.transform(X_train)
X_test_std = standardscaler.transform(X_test)

### Train Model

In [18]:
multi_rf_regressor = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, 
                                                                max_features='auto', 
                                                                random_state=42))

In [19]:
start = timeit.timeit()
multi_rf_regressor.fit(X_train_std, y_train)
end = timeit.timeit()
print(round(start - end, 4) / 60.0, "minutes")

2.1666666666666667e-05 minutes


In [20]:
multi_rf_regressor.score(X_test_std, y_test)

0.35444628340218837

In [22]:
# Pickle the model
joblib.dump(multi_rf_regressor, 'DFS_multi_rf_regressor.pkl')

# Load the model
# rf_regressor_joblib = joblib.load('DFS_rf_regressor.pkl')
# rf_regressor_joblib

['DFS_multi_rf_regressor.pkl']

#### Feature Importance

In [None]:
get_feature_importance(rf_regressor_joblib, X_colnames)

### Predictions

In [None]:
test_date = (test_sample['DATE'] == '2019-03-02') & (test_sample['OWNTEAM'] == 'Detroit')
pilot = test_sample[test_date]
pilot.head()

In [None]:
#pilot.info(verbose=True)

In [None]:
pilot_df = clean_nulls(pilot)
pilot.loc[:, boxscore_stats] = np.nan
pilot_prep_df = prepare_dataset(pilot_df, drop_date=True, dfs_type="YH")
pilot_prep_df = pilot_prep_df.drop(['YAHOO_FANTASYPOINTS'], axis=1)

In [None]:
pilot_prep_df.isna().sum()

In [None]:
pilot_scaler = StandardScaler()

pilot_scaler.fit(pilot_prep_df)
std_pilot_df = pilot_scaler.transform(pilot_prep_df)

In [None]:
std_pilot_df

In [None]:
# player_diff = list(set(test_prep_df.columns.values) - set(X_colnames))
# print(len(player_diff))
# player_diff

In [None]:
# Xnew = [std_test_df[100]]
# Xpilot = [std_pilot_df[100]]

# ynew = rf_regressor.predict(Xnew)
# print("Predicted (Yahoo Fantasy Points)=%s" % round(ynew[0], 2))

In [None]:
#rf_regressor.predict([X_test_std[100]])

In [None]:
#Xnew

### Actual

In [None]:
#df[1995:2000][['PLAYER', 'MINUTES', 'USAGE RATE', 'DAYSREST', 'YAHOO_FULLSLATE_SALARY', 'MIN', 'YAHOO_FANTASYPOINTS']] #['YAHOO_FANTASYPOINTS']

In [None]:
# df[(df['MINUTES'] > 30)  & \
#    (df['USAGE RATE'] > 20) & \
#    (df['DAYSREST'] == 0) & \
#    (df['YAHOO_FULLSLATE_SALARY'] == 37) & \
#    (df['MIN'] == 35.3)]['YAHOO_FANTASYPOINTS']

In [None]:
df.head()

### Example

In [None]:
from pandas import DataFrame
from sklearn import linear_model
import statsmodels.api as sm

In [None]:
Stock_Market = {'Year': [2017,2017,2017,2017,2017,2017,2017,2017,2017,2017,2017,2017,2016,2016,2016,2016,2016,2016,2016,2016,2016,2016,2016,2016],
                'Month': [12, 11,10,9,8,7,6,5,4,3,2,1,12,11,10,9,8,7,6,5,4,3,2,1],
                'Interest_Rate': [2.75,2.5,2.5,2.5,2.5,2.5,2.5,2.25,2.25,2.25,2,2,2,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75,1.75],
                'Unemployment_Rate': [5.3,5.3,5.3,5.3,5.4,5.6,5.5,5.5,5.5,5.6,5.7,5.9,6,5.9,5.8,6.1,6.2,6.1,6.1,6.1,5.9,6.2,6.2,6.1],
                'Stock_Index_Price': [1464,1394,1357,1293,1256,1254,1234,1195,1159,1167,1130,1075,1047,965,943,958,971,949,884,866,876,822,704,719]        
                }

df = DataFrame(Stock_Market,columns=['Year','Month','Interest_Rate','Unemployment_Rate','Stock_Index_Price'])
df.head()

In [None]:
X = df[['Interest_Rate','Unemployment_Rate']] # here we have 2 variables for multiple regression. If you just want to use one variable for simple linear regression, then use X = df['Interest_Rate'] for example.Alternatively, you may add additional variables within the brackets
Y = df['Stock_Index_Price']

In [None]:
 # with sklearn
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept:', regr.intercept_)
print('Coefficients:', regr.coef_)

In [None]:
# prediction with sklearn
New_Interest_Rate = 2.75
New_Unemployment_Rate = 5.3
print ('Predicted Stock Index Price:', regr.predict([[New_Interest_Rate ,New_Unemployment_Rate]]))

In [None]:
# # with statsmodels
# X = sm.add_constant(X) # adding a constant
 
# model = sm.OLS(Y, X).fit()
# predictions = model.predict(X) 
 
# print_model = model.summary()
# print(print_model)

### Example 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [None]:
# Create a random dataset
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
y += (0.5 - rng.rand(*y.shape))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=400, test_size=200, random_state=4)

max_depth = 30
regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100,
                                                          max_depth=max_depth,
                                                          random_state=0))
regr_multirf.fit(X_train, y_train)

In [None]:
regr_multirf.score(X_test, y_test)

In [None]:
regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth,
                                random_state=2)
regr_rf.fit(X_train, y_train)

In [None]:
regr_rf.score(X_test, y_test)

In [None]:
# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# Plot the results
plt.figure()
s = 50
a = 0.4
plt.scatter(y_test[:, 0], y_test[:, 1], edgecolor='k',
            c="navy", s=s, marker="s", alpha=a, label="Data")
plt.scatter(y_multirf[:, 0], y_multirf[:, 1], edgecolor='k',
            c="cornflowerblue", s=s, alpha=a,
            label="Multi RF score=%.2f" % regr_multirf.score(X_test, y_test))
plt.scatter(y_rf[:, 0], y_rf[:, 1], edgecolor='k',
            c="c", s=s, marker="^", alpha=a,
            label="RF score=%.2f" % regr_rf.score(X_test, y_test))
plt.xlim([-6, 6])
plt.ylim([-6, 6])
plt.xlabel("target 1")
plt.ylabel("target 2")
plt.title("Comparing random forests and the multi-output meta estimator")
plt.legend()
plt.show()

In [None]:
X_test