# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

# Parse the timestamp column by name rather than by position, so a reordered
# CSV cannot silently turn a different column into datetimes.
df = (
    pd.read_csv('data/football.csv', parse_dates=['fetched_at'])
    # Order each player's games by week so the per-player shift() lags
    # computed later refer to that player's previous games.
    .sort_values(['name', 'week'])
    .reset_index(drop=True)
)

In [2]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3154 entries, 0 to 3153
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   name        3154 non-null   object        
 1   position    3154 non-null   object        
 2   yards       3154 non-null   float64       
 3   week        3154 non-null   int64         
 4   fetched_at  3154 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 123.3+ KB


In [3]:
# (n_rows, n_columns)
df.shape

(3154, 5)

In [4]:
# Random peek at 10 rows; no random_state is passed, so a re-run shows different rows.
df.sample(10)

Unnamed: 0,name,position,yards,week,fetched_at
213,Ashton Dulin,WR,13.0,8,2020-11-09 15:53:58.887999
10,A.J. Dillon,RB,11.0,7,2020-11-09 15:53:56.069259
2448,Odell Beckham,WR,75.0,5,2020-11-09 15:53:46.280021
1437,Jalen Guyton,WR,14.0,2,2020-11-09 15:53:23.226187
552,Chris Thompson,RB,42.0,5,2020-11-09 15:53:46.280021
2069,Levine Toilolo,TE,0.0,4,2020-11-09 15:53:39.185434
369,Byron Pringle,WR,23.0,5,2020-11-09 15:53:46.280021
223,Austin Ekeler,RB,87.0,1,2020-11-09 15:53:14.844588
1352,J.J. Taylor,RB,-1.0,2,2020-11-09 15:53:23.226187
1822,Justice Hill,RB,0.0,6,2020-11-09 15:53:48.807659


In [5]:
# First three rows for a single player.
df.loc[df['name'].eq('Aaron Rodgers')].head(3)

Unnamed: 0,name,position,yards,week,fetched_at
25,Aaron Rodgers,QB,366.0,1,2020-11-09 15:53:14.844588
26,Aaron Rodgers,QB,246.0,2,2020-11-09 15:53:23.226187
27,Aaron Rodgers,QB,295.0,3,2020-11-09 15:53:31.482681


In [6]:
# Independent copy of one player's rows, so adding columns below leaves df untouched.
aaron = df.loc[df['name'].eq('Aaron Rodgers')].copy()

In [7]:
# Lag features: yards from the previous row and from two rows back.
aaron = aaron.assign(
    yards_1=aaron['yards'].shift(1),
    yards_2=aaron['yards'].shift(2),
)

In [8]:
# Week 5 is absent from the output, so the lags count games played rather than
# calendar weeks: week 6's yards_1 is week 4's 332.
aaron.head(10)

Unnamed: 0,name,position,yards,week,fetched_at,yards_1,yards_2
25,Aaron Rodgers,QB,366.0,1,2020-11-09 15:53:14.844588,,
26,Aaron Rodgers,QB,246.0,2,2020-11-09 15:53:23.226187,366.0,
27,Aaron Rodgers,QB,295.0,3,2020-11-09 15:53:31.482681,246.0,366.0
28,Aaron Rodgers,QB,332.0,4,2020-11-09 15:53:39.185434,295.0,246.0
29,Aaron Rodgers,QB,174.0,6,2020-11-09 15:53:48.807659,332.0,295.0
30,Aaron Rodgers,QB,283.0,7,2020-11-09 15:53:56.069259,174.0,332.0
31,Aaron Rodgers,QB,300.0,8,2020-11-09 15:53:58.887999,283.0,174.0


In [9]:
# Same lag features for every player: shifting inside each name group keeps one
# player's yards from spilling into the next player's first rows.
df = df.assign(
    yards_1=lambda frame: frame.groupby('name')['yards'].shift(1),
    yards_2=lambda frame: frame.groupby('name')['yards'].shift(2),
)

In [10]:
# Keep only rows where both lag features exist (each player's first two rows have none).
df = df[df[["yards_1", "yards_2"]].notna().all(axis=1)]

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*yards\* next game based on the yards gained in the last two games

In [11]:
# Target is this game's yards; features are the position and the two lagged yard columns.
target = 'yards'
X, y = df[['position', 'yards_1', 'yards_2']], df[target]

In [12]:
from sklearn.model_selection import train_test_split

`train_test_split` on time series data is a little different: pass `shuffle=False` so the rows are split in their existing order instead of being shuffled first.

In [13]:
# shuffle=False keeps row order: the first 90% of rows train, the last 10% test.
# No random_state is passed: it only seeds shuffling, which is disabled here.
# NOTE(review): df was sorted by ['name', 'week'], so the hold-out is the tail of
# the name-ordered frame rather than the most recent weeks -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=False,
)

### The Model

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [15]:
mapper = DataFrameMapper(
    features=[
        # Categorical: fill gaps with the most frequent position, then binarize.
        (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
        # Numeric lags: default imputation, then standardize (fresh transformers per column).
        *[
            ([lag_col], [SimpleImputer(), StandardScaler()])
            for lag_col in ('yards_1', 'yards_2')
        ],
    ],
    df_out=True,
)

In [16]:
# Fit the mapper on the training rows only, then reuse those fitted transformers
# on the test rows so no test-set statistics feed into the features.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [17]:
# Transformed training features: one indicator column per position plus the two scaled lag columns.
Z_train

Unnamed: 0,position_QB,position_RB,position_TE,position_WR,yards_1,yards_2
2,0,0,0,1,0.382029,-0.143412
3,0,0,0,1,0.055700,0.412711
4,0,0,0,1,1.273157,0.076451
7,0,1,0,0,-0.433793,-0.466740
8,0,1,0,0,-0.647162,-0.427940
...,...,...,...,...,...,...
2830,0,0,1,0,-0.496548,-0.647803
2833,0,1,0,0,-0.308282,-0.039947
2836,1,0,0,0,-0.546753,0.192849
2837,1,0,0,0,-0.559304,-0.544338


In [18]:
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(Z_train, y_train)
model

LinearRegression()

In [19]:
# Score on the training rows themselves (R^2 for scikit-learn regressors), not a hold-out estimate.
model.score(Z_train, y_train)

0.7557139426106689

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# Square root of the test-set MSE, i.e. RMSE in the target's own units.
mean_squared_error(
    y_true=y_test,
    y_pred=model.predict(Z_test),
) ** 0.5

35.87262338741972

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [22]:
# Four known positions, one per row.
demo = pd.DataFrame(data=['RB', 'QB', 'TE', 'WR'], columns=['position'])

# get_dummies derives its columns from whatever values this particular frame contains.
pd.get_dummies(demo)

Unnamed: 0,position_QB,position_RB,position_TE,position_WR
0,0,1,0,0
1,1,0,0,0
2,0,0,1,0
3,0,0,0,1


In [23]:
# A different set of position labels than the previous demo.
demo_2 = pd.DataFrame(
    data=['TE/WR', 'RB/WR', 'QB-TE', 'TE', 'QB'],
    columns=['position'],
)

# The dummy columns now differ from the previous demo: they follow the data in hand.
pd.get_dummies(demo_2)

Unnamed: 0,position_QB,position_QB-TE,position_RB/WR,position_TE,position_TE/WR
0,0,0,0,0,1
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,1,0,0,0,0


In [24]:
# A single row holding a value that is not a real position.
new = pd.DataFrame(data=['🍔'], columns=['position'])

# get_dummies invents a column for the unknown value instead of flagging it.
pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [25]:
# Five random raw (untransformed) training rows; unseeded, so they change per run.
X_train.sample(5)

Unnamed: 0,position,yards_1,yards_2
1632,WR,62.0,56.0
718,RB,146.0,199.0
1913,WR,29.0,62.0
1904,WR,46.0,43.0
734,WR,29.0,0.0


In [26]:
# Learn the label set from the training positions, then encode those same positions.
lb = LabelBinarizer().fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])

In [27]:
# Labels learned during fit.
lb.classes_

array(['QB', 'RB', 'TE', 'WR'], dtype='<U2')

In [28]:
# One row with a label the binarizer never saw during fit.
new = pd.DataFrame(data=['🍔'], columns=['position'])

In [29]:
# The unseen label does not raise; it encodes as an all-zero row (see output),
# keeping the same four columns learned from the training data.
lb.transform(new['position'])

array([[0, 0, 0, 0]])

In [30]:
# One row whose position is missing altogether.
new = pd.DataFrame({
    'position': [None]
})

# Left disabled on purpose: presumably the bare LabelBinarizer cannot handle a
# missing value and raises here -- TODO confirm. The mapper defined next puts a
# SimpleImputer in front of the LabelBinarizer for this column.
# lb.transform(new['position'])

In [31]:
mapper = DataFrameMapper(
    features=[
        # Impute first so a missing position is filled before it reaches the binarizer.
        (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
        # Each lag column gets its own imputer + scaler pair.
        *[
            ([lag_col], [SimpleImputer(), StandardScaler()])
            for lag_col in ('yards_1', 'yards_2')
        ],
    ],
    df_out=True,
)

In [32]:
# Fit on the training rows, then preview the first ten transformed rows
# (df_out=True makes transform return a DataFrame).
mapper.fit(X_train)
mapper.transform(X_train).head(10)

Unnamed: 0,position_QB,position_RB,position_TE,position_WR,yards_1,yards_2
2,0,0,0,1,0.382029,-0.143412
3,0,0,0,1,0.0557,0.412711
4,0,0,0,1,1.273157,0.076451
7,0,1,0,0,-0.433793,-0.46674
8,0,1,0,0,-0.647162,-0.42794
9,0,1,0,0,-0.609508,-0.647803
10,0,1,0,0,-0.258077,-0.609004
11,0,1,0,0,-0.5091,-0.246877
14,0,0,0,1,-0.28318,0.011785
15,0,0,0,1,-0.195322,-0.272743


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [33]:
from sklearn.pipeline import make_pipeline

# Chain preprocessing and regression so raw rows go in and predictions come out.
# fit() returns the pipeline itself, so build-and-fit can be a single expression.
pipe = make_pipeline(mapper, model).fit(X_train, y_train)
pipe

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['yards_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['yards_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [34]:
import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs('pickles', exist_ok=True)

# Serialize the whole fitted pipeline (preprocessing + model) as one artifact.
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [35]:
# Drop the in-memory pipeline so the next cell has to restore it from disk.
del pipe

In [36]:
# Restore the pipeline from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you created or trust.
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [37]:
# Sanity check of the round trip: the reloaded pipeline reproduces the training
# score reported for the model earlier (0.7557...).
pipe.score(X_train, y_train)

0.7557139426106689

In [38]:
# First ten predictions from the reloaded pipeline, fed raw (untransformed) rows.
pipe.predict(X_train)[:10]

array([49.5 , 56.25, 76.5 , 25.75, 21.75, 16.5 , 26.25, 29.75, 37.5 ,
       32.  ])

In [39]:
# One random training row as {column: [value]} -- a template for hand-building input.
X_train.sample(1).to_dict(orient='list')

{'position': ['WR'], 'yards_1': [0.0], 'yards_2': [0.0]}

In [40]:
# Hand-built single-row input with the same three columns the pipeline was trained on.
new = pd.DataFrame([
    {'position': 'WR', 'yards_1': 28.0, 'yards_2': 20.0},
])

In [41]:
# Raw DataFrame in, prediction out: the pipeline runs the mapper before the regression.
pipe.predict(new)

array([26.5])