In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
te = pd.read_csv('../input/te/te_stats.csv')

In [4]:
te.isnull().sum()

Unnamed: 0       0
Name             0
Week           416
Opponent       416
tar_avg        416
rec_avg        416
yds_avg        416
td_avg         416
fum_avg        416
def_rk_avg     416
tar_ma         416
rec_ma         416
yds_ma         416
td_ma          416
fum_ma         416
def_rk_ma      416
tar_car        416
rec_car        416
yds_car        416
td_car         416
fum_car        416
def_rk_car     416
def_rk         416
h/a_1          416
surface_0      416
points         416
games          416
year           416
0             3167
dtype: int64

In [24]:
te.rename(columns={'Unnamed: 0':'player'}, inplace=True)

In [25]:

te.dtypes

player         object
Week            int64
Opponent       object
year            int64
var1(t-1)     float64
var2(t-1)     float64
var3(t-1)     float64
var4(t-1)     float64
var5(t-1)     float64
var6(t-1)     float64
var7(t-1)     float64
var8(t-1)     float64
var9(t-1)     float64
var10(t-1)    float64
var11(t-1)    float64
var12(t-1)    float64
var13(t-1)    float64
var14(t-1)    float64
var15(t-1)    float64
var1(t)       float64
var2(t)       float64
var3(t)       float64
var4(t)       float64
var5(t)       float64
var6(t)       float64
var7(t)       float64
var8(t)       float64
var9(t)       float64
var10(t)      float64
var11(t)      float64
var12(t)      float64
var13(t)      float64
var14(t)      float64
var15(t)      float64
def_rk          int64
h/a_1         float64
surface_0     float64
salary          int64
points        float64
dtype: object

In [26]:
# Replace nulls with zero and turn salary attribute to float
te.replace('null',0, inplace=True)
te.salary = te['salary'].apply(lambda x: float(x))

In [8]:
def conv_atts(df):
    df.rename(columns={'Unnamed: 0':'player'}, inplace=True)
    df.replace('null',0, inplace=True)
    df['salary'] = df['salary'].apply(lambda x: float(x))
    df['h/a_1'] = df['h/a_1'].astype('category')
    df['surface_0'] = df['surface_0'].astype('category')
    df['def_rk'] = df['def_rk'].astype('category')

    return df

In [27]:
te = conv_atts(te)
te.dtypes


player          object
Week             int64
Opponent        object
year             int64
var1(t-1)      float64
var2(t-1)      float64
var3(t-1)      float64
var4(t-1)      float64
var5(t-1)      float64
var6(t-1)      float64
var7(t-1)      float64
var8(t-1)      float64
var9(t-1)      float64
var10(t-1)     float64
var11(t-1)     float64
var12(t-1)     float64
var13(t-1)     float64
var14(t-1)     float64
var15(t-1)     float64
var1(t)        float64
var2(t)        float64
var3(t)        float64
var4(t)        float64
var5(t)        float64
var6(t)        float64
var7(t)        float64
var8(t)        float64
var9(t)        float64
var10(t)       float64
var11(t)       float64
var12(t)       float64
var13(t)       float64
var14(t)       float64
var15(t)       float64
def_rk        category
h/a_1         category
surface_0     category
salary         float64
points         float64
dtype: object

In [28]:
# Design train and test datasets

train = te[te['year']!=2018]
train = train[train['Week']!=4]
train['year'].unique()

array([2016, 2017, 2015], dtype=int64)

In [29]:
train.shape

(2594, 39)

In [30]:
test = te[te['year']==2018]
test=test[test['Week']==4]
test.shape

(62, 39)

In [31]:
test.columns

Index(['player', 'Week', 'Opponent', 'year', 'var1(t-1)', 'var2(t-1)',
       'var3(t-1)', 'var4(t-1)', 'var5(t-1)', 'var6(t-1)', 'var7(t-1)',
       'var8(t-1)', 'var9(t-1)', 'var10(t-1)', 'var11(t-1)', 'var12(t-1)',
       'var13(t-1)', 'var14(t-1)', 'var15(t-1)', 'var1(t)', 'var2(t)',
       'var3(t)', 'var4(t)', 'var5(t)', 'var6(t)', 'var7(t)', 'var8(t)',
       'var9(t)', 'var10(t)', 'var11(t)', 'var12(t)', 'var13(t)', 'var14(t)',
       'var15(t)', 'def_rk', 'h/a_1', 'surface_0', 'salary', 'points'],
      dtype='object')

In [32]:
# Target variables 

y_train = train['points']
y_test = test['points']

# Predictors, all numeric variables minus target variable
X_train = train.iloc[:,4:-2]
X_test = test.iloc[:,4:-2]

In [90]:
scaler = MinMaxScaler()
scaler.fit_transform(X_train)

array([[ 0.        ,  1.        ,  0.0070922 , ...,  0.        ,
         0.51998093,  0.14457831],
       [ 0.01960784,  0.8335    ,  0.02600473, ...,  0.19142857,
         0.5486886 ,  0.39759036],
       [ 0.21568627,  0.785     ,  0.23799054, ...,  0.21428571,
         0.62250835,  0.5       ],
       ..., 
       [ 0.46411765,  0.6547    ,  0.33174941, ...,  0.21428571,
         0.48526466,  0.81325301],
       [ 0.48039216,  0.6695    ,  0.39184397, ...,  0.22857143,
         0.45808298,  0.85542169],
       [ 0.58039216,  0.6414    ,  0.42411348, ...,  0.19142857,
         0.36185026,  0.8253012 ]])

In [140]:
X_test.shape

(32, 40)

In [141]:
X_train.shape

(1457, 40)

In [16]:
y_train.reset_index(inplace=True,drop=True)
X_train.reset_index(inplace=True, drop=True)

In [34]:
lr = LinearRegression()

In [35]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
coeff = pd.DataFrame(X_train.columns)
coeff['coefficients'] = lr.coef_
coeff



Unnamed: 0,0,coefficients
0,var1(t-1),-0.146245
1,var2(t-1),-0.715551
2,var3(t-1),-0.048592
3,var4(t-1),-3.153114
4,var5(t-1),-2.024551
5,var6(t-1),-0.054092
6,var7(t-1),-0.408811
7,var8(t-1),0.049238
8,var9(t-1),1.750512
9,var10(t-1),-2.536799


In [138]:
X_train.shape

(1457, 25)

In [37]:
yhat = lr.predict(X_test)

In [38]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

rmse_val = rmse(yhat, y_test)
print("RMSE error is: " + str(rmse_val))

RMSE error is: 2.5081753355699448


In [39]:
results = pd.DataFrame()
results['Player'] = test['player']
#results['score'] = y_test
results['Predicted'] = yhat
results['points'] = test['points']
#results['Week'] = qb[qb['year']==2017]['Week']
#results.groupby('Player').mean()[['score','Predicted']]
results

Unnamed: 0,Player,Predicted,points
101,Antonio Gates_reframed,9.107350,10.7
103,Antony Auclair_reframed,1.547872,1.8
133,Austin Hooper_reframed,6.449231,2.9
165,Austin Seferian-Jenkins_reframed,5.226458,4.4
219,Benjamin Watson_reframed,5.675421,3.3
359,CJ Uzomah_reframed,3.715359,1.6
404,Cameron Brate_reframed,11.528196,11.9
448,Charles Clay_reframed,6.719608,8.0
457,Chris Herndon IV_reframed,2.199941,2.8
584,Dallas Goedert_reframed,3.995201,3.3


In [42]:
results.head()

Unnamed: 0,Player,Predicted,points
101,Antonio Gates_reframed,9.10735,10.7
103,Antony Auclair_reframed,1.547872,1.8
133,Austin Hooper_reframed,6.449231,2.9
165,Austin Seferian-Jenkins_reframed,5.226458,4.4
219,Benjamin Watson_reframed,5.675421,3.3


In [46]:
results.to_csv('../output/te_test.csv', index= False)

In [47]:
te_points['projected'] = te_points['player'].map(lambda x: x['Predicted'], results)

NameError: name 'te_points' is not defined

In [None]:
te_poins = 