In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [5]:
qb = pd.read_csv('../input/qb/qb_stats.csv')
qb_cur = pd.read_csv('../input/qb/qb_cur.csv')

In [6]:
qb.shape

(2428, 21)

In [7]:
qb_cur.head()

Unnamed: 0,Name,Team,year,Week,Opponent,def_rk,h/a_1,surface_0
0,Patrick Mahomes,KC,2018,9,CLE,12,0.0,1.0
1,Cam Newton,CAR,2018,9,TB,15,1.0,1.0
2,Deshaun Watson,HOU,2018,9,DEN,26,0.0,1.0
3,Kirk Cousins,MIN,2018,9,DET,20,1.0,0.0
4,Drew Brees,NO,2018,9,LAR,8,1.0,0.0


In [15]:
qb.dtypes

year           int64
Name          object
Week           int64
games          int64
Opponent      object
att_avg      float64
comp%_avg    float64
yds_avg      float64
td_avg       float64
att_ma       float64
comp%_ma     float64
yds_ma       float64
td_ma        float64
att_car      float64
comp%_car    float64
yds_car      float64
td_car       float64
surface_0    float64
h/a_1        float64
def_rk         int64
points       float64
dtype: object

In [16]:
def conv_atts(df):
    df['h/a_1'] = df['h/a_1'].astype('category')
    df['surface_0'] = df['surface_0'].astype('category')

    return df

In [17]:
qb = conv_atts(qb)
qb.dtypes


year            int64
Name           object
Week            int64
games           int64
Opponent       object
att_avg       float64
comp%_avg     float64
yds_avg       float64
td_avg        float64
att_ma        float64
comp%_ma      float64
yds_ma        float64
td_ma         float64
att_car       float64
comp%_car     float64
yds_car       float64
td_car        float64
surface_0    category
h/a_1        category
def_rk          int64
points        float64
dtype: object

In [18]:
qb.isnull().sum()

year         0
Name         0
Week         0
games        0
Opponent     0
att_avg      0
comp%_avg    0
yds_avg      0
td_avg       0
att_ma       0
comp%_ma     0
yds_ma       0
td_ma        0
att_car      0
comp%_car    0
yds_car      0
td_car       0
surface_0    0
h/a_1        0
def_rk       0
points       0
dtype: int64

In [19]:
qb = qb.sort_values(['year','Week']).reset_index(drop=True)
qb.tail()

Unnamed: 0,year,Name,Week,games,Opponent,att_avg,comp%_avg,yds_avg,td_avg,att_ma,...,yds_ma,td_ma,att_car,comp%_car,yds_car,td_car,surface_0,h/a_1,def_rk,points
2423,2018,Sam Darnold,8,8,CHI,31.57,57.37,221.71,1.43,31.571429,...,221.714286,1.428571,31.57,57.37,221.71,1.43,1.0,0.0,4,12.32
2424,2018,Taylor Heinicke,8,1,BAL,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25,0.32
2425,2018,Taysom Hill,8,7,MIN,0.33,8.33,1.67,0.0,0.333333,...,1.666667,0.0,0.33,8.33,1.67,0.0,0.0,0.0,3,3.06
2426,2018,Teddy Bridgewater,8,14,MIN,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0
2427,2018,Tom Brady,8,80,BUF,35.71,67.16,268.0,2.29,35.714286,...,268.0,2.285714,35.88,64.84,258.25,2.25,0.0,0.0,9,16.76


In [20]:
test.index[0]

2385

In [20]:
test = qb[qb['year']==2018]
test = test[test['Week']==8]
test.head()

Unnamed: 0,year,Name,Week,games,Opponent,att_avg,comp%_avg,yds_avg,td_avg,att_ma,...,yds_ma,td_ma,att_car,comp%_car,yds_car,td_car,surface_0,h/a_1,def_rk,points
2390,2018,Aaron Rodgers,8,74,LAR,42.33,61.72,332.83,2.0,42.333333,...,332.833333,2.0,42.71,61.16,326.71,2.14,1.0,0.0,8,15.94
2391,2018,Alex Smith,8,67,NYG,32.67,62.5,230.5,1.17,32.666667,...,230.5,1.166667,33.57,62.73,241.0,1.14,0.0,0.0,5,12.42
2392,2018,Andrew Luck,8,69,OAK,44.43,65.73,278.29,2.86,44.428571,...,278.285714,2.857143,44.43,65.73,278.29,2.86,1.0,0.0,19,23.36
2393,2018,Andy Dalton,8,72,TB,36.86,63.73,260.29,2.14,36.857143,...,260.285714,2.142857,37.75,62.3,255.5,2.25,0.0,1.0,15,20.2
2394,2018,Baker Mayfield,8,6,PIT,37.4,59.72,258.2,1.2,37.4,...,258.2,1.2,37.4,59.72,258.2,1.2,1.0,0.0,7,15.1


In [21]:
train = qb[:test.index[0]]
train.tail()

Unnamed: 0,year,Name,Week,games,Opponent,att_avg,comp%_avg,yds_avg,td_avg,att_ma,...,yds_ma,td_ma,att_car,comp%_car,yds_car,td_car,surface_0,h/a_1,def_rk,points
2385,2018,Philip Rivers,7,78,TEN,32.33,68.75,283.67,2.5,32.333333,...,283.666667,2.5,33.0,69.74,298.43,2.57,1.0,1.0,22,23.14
2386,2018,Sam Darnold,7,7,MIN,29.83,60.18,224.33,1.5,29.833333,...,224.333333,1.5,29.83,60.18,224.33,1.5,0.0,1.0,9,16.04
2387,2018,Sean Mannion,7,1,SF,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11,0.1
2388,2018,Taysom Hill,7,6,BAL,0.4,10.0,2.0,0.0,0.4,...,2.0,0.0,0.4,10.0,2.0,0.0,1.0,0.0,15,2.5
2389,2018,Tom Brady,7,79,CHI,35.67,66.78,266.5,2.17,35.666667,...,266.5,2.166667,35.86,64.19,255.57,2.14,1.0,0.0,20,22.68


In [22]:
test.shape

(38, 21)

In [23]:
train.shape

(2390, 21)

In [31]:
df_18['Week'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [32]:
test.columns

Index(['year', 'Name', 'Week', 'Opponent', 'games', 'att_avg', 'comp%_avg',
       'yds_avg', 'td_avg', 'att_ma', 'comp%_ma', 'yds_ma', 'td_ma', 'att_car',
       'comp%_car', 'yds_car', 'td_car', 'surface_0', 'h/a_1', 'def_rk',
       'points'],
      dtype='object')

In [10]:
test.columns

NameError: name 'test' is not defined

In [34]:
# Target variables 

y_train = train['points']
y_test = test['points']

# Predictors, all numeric variables minus target variable
X_train = train.iloc[:,4:-1]
X_test = test.iloc[:,4:-1]

In [25]:
test[test['Name']=='Tom Brady']

Unnamed: 0,year,Name,Week,games,Opponent,att_avg,comp%_avg,yds_avg,td_avg,att_ma,...,yds_ma,td_ma,att_car,comp%_car,yds_car,td_car,surface_0,h/a_1,def_rk,points
2427,2018,Tom Brady,8,80,BUF,35.71,67.16,268.0,2.29,35.714286,...,268.0,2.285714,35.88,64.84,258.25,2.25,0.0,0.0,9,16.76


In [35]:

X_test.columns

Index(['games', 'att_avg', 'comp%_avg', 'yds_avg', 'td_avg', 'att_ma',
       'comp%_ma', 'yds_ma', 'td_ma', 'att_car', 'comp%_car', 'yds_car',
       'td_car', 'surface_0', 'h/a_1', 'def_rk'],
      dtype='object')

In [36]:
scaler = MinMaxScaler()
scaler.fit_transform(X_train)

array([[0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.12903226],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.51612903],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.96774194],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.32258065],
       [0.0625    , 0.00645161, 0.1       , ..., 1.        , 0.        ,
        0.4516129 ],
       [0.975     , 0.57532258, 0.6678    , ..., 1.        , 0.        ,
        0.61290323]])

In [37]:
X_test.shape

(38, 16)

In [38]:
X_train.shape

(2390, 16)

In [16]:
y_train.reset_index(inplace=True,drop=True)
X_train.reset_index(inplace=True, drop=True)

In [39]:
lr = LinearRegression()

In [40]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
coeff = pd.DataFrame(X_train.columns)
coeff['coefficients'] = lr.coef_
coeff



Unnamed: 0,0,coefficients
0,games,0.051099
1,att_avg,0.033095
2,comp%_avg,-0.087637
3,yds_avg,0.011961
4,td_avg,1.000565
5,att_ma,-0.173532
6,comp%_ma,-0.02921
7,yds_ma,0.014323
8,td_ma,1.810278
9,att_car,0.042001


In [42]:
X_train.shape

(2390, 16)

In [43]:
yhat = lr.predict(X_test)

In [44]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

rmse_val = rmse(yhat, y_test)
print("RMSE error is: " + str(rmse_val))

RMSE error is: 7.3332604844036124


In [46]:
results = pd.DataFrame()
results['Player'] = test['Name']
#results['score'] = y_test
results['Predicted'] = yhat
results['points'] = test['points']
#results['Week'] = qb[qb['year']==2017]['Week']
#results.groupby('Player').mean()[['score','Predicted']]
results

Unnamed: 0,Player,Predicted,points
2390,Aaron Rodgers,22.633127,15.94
2391,Alex Smith,17.452866,12.42
2392,Andrew Luck,24.275479,23.36
2393,Andy Dalton,23.072337,20.2
2394,Baker Mayfield,14.848892,15.1
2395,Ben Roethlisberger,23.918496,17.88
2396,Blake Bortles,19.003995,19.74
2397,Brandon Weeden,20.653366,-0.1
2398,Brian Hoyer,13.611653,-0.2
2399,Brock Osweiler,16.823254,8.64


In [83]:
results.

SyntaxError: invalid syntax (<ipython-input-83-905f1e55433b>, line 1)

In [24]:
results.to_csv('../output/qb_test.csv', index= False)