In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder 
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

In [108]:
# Classification Packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [84]:
ff = pd.read_csv('FantasyPros_Fantasy_Football_Advanced_Stats_Report_WR.csv')

In [85]:
ff.head()

Unnamed: 0,Rank,Player,G,REC,YDS,Y/R,YBC,YBC/R,AIR,AIR/R,...,% TM,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,LNG
0,1.0,Davante Adams (GB),14.0,115.0,1374,11.9,777,6.8,777,6.8,...,29.7%,116.0,1.0,28.0,49.0,18.0,10.0,5.0,1.0,56.0
1,2.0,Tyreek Hill (KC),15.0,87.0,1276,14.7,842,9.7,842,9.7,...,22.1%,94.0,5.0,17.0,49.0,20.0,11.0,5.0,2.0,75.0
2,3.0,Stefon Diggs (BUF),16.0,127.0,1535,12.1,1071,8.4,1071,8.4,...,29.0%,137.0,8.0,16.0,62.0,20.0,5.0,5.0,2.0,55.0
3,4.0,Calvin Ridley (ATL),15.0,90.0,1374,15.3,1099,12.2,1099,12.2,...,23.6%,96.0,6.0,20.0,58.0,23.0,8.0,3.0,2.0,63.0
4,5.0,D.K. Metcalf (SEA),16.0,83.0,1303,15.7,940,11.3,940,11.3,...,24.0%,92.0,8.0,16.0,50.0,17.0,12.0,5.0,3.0,62.0


because we are looking at advanced statistics we must get rid of some of the more basic statistics. Specifically receptions, games, targets, and yards.

In [86]:
display(ff.columns)

Index(['Rank', 'Player', 'G', 'REC', 'YDS', 'Y/R', 'YBC', 'YBC/R', 'AIR',
       'AIR/R', 'YAC', 'YAC/R', 'YACON', 'YACON/R', 'BRKTKL', 'TGT', '% TM',
       'CATCHABLE', 'DROP', 'RZ TGT', '10+ YDS', '20+ YDS', '30+ YDS',
       '40+ YDS', '50+ YDS', 'LNG'],
      dtype='object')

In [87]:
ff_adj = ff[['Rank', 'Player', 'Y/R', 'YBC', 'YBC/R', 'AIR','AIR/R', 'YAC', 'YAC/R', 'YACON', 'YACON/R', 'BRKTKL', '% TM',
'CATCHABLE', 'DROP', 'RZ TGT', '10+ YDS', '20+ YDS', '30+ YDS','40+ YDS', '50+ YDS', 'LNG']]

we must change the variable '% TM' from a percentage to a float, so it can be evaluated numerically.

In [88]:
ff_adj['% TM'] = ff_adj['% TM'].str.rstrip('%').astype('float') / 100.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ff_adj['% TM'] = ff_adj['% TM'].str.rstrip('%').astype('float') / 100.0


In [89]:
corr = ff_adj.corr()
corr['Rank'].nsmallest(6)

10+ YDS     -0.920567
% TM        -0.917860
CATCHABLE   -0.915263
20+ YDS     -0.879548
YAC         -0.861921
RZ TGT      -0.853427
Name: Rank, dtype: float64

Because the better the rank the smaller the number, we must look at the smallest rather than largest correlation.

In [90]:
ff_final = ff_adj[['Rank', 'Player','10+ YDS', '% TM', 'CATCHABLE', '20+ YDS', 'YAC', 'RZ TGT']]

In [92]:
df = ff_final.dropna()
df.isna().sum()

Rank         0
Player       0
10+ YDS      0
% TM         0
CATCHABLE    0
20+ YDS      0
YAC          0
RZ TGT       0
dtype: int64

I need to create a variable where I can test my accuracy so I use the 

In [105]:
df['Top 50'] = np.where(df['Rank'] < 51, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Top 50'] = np.where(df['Rank'] < 51, 1, 0)


In [106]:
df.head(2)

Unnamed: 0,Rank,Player,10+ YDS,% TM,CATCHABLE,20+ YDS,YAC,RZ TGT,Top 50
0,1.0,Davante Adams (GB),49.0,0.297,116.0,18.0,597.0,28.0,1
1,2.0,Tyreek Hill (KC),49.0,0.221,94.0,20.0,434.0,17.0,1


In [107]:
df.tail(2)

Unnamed: 0,Rank,Player,10+ YDS,% TM,CATCHABLE,20+ YDS,YAC,RZ TGT,Top 50
197,198.0,Tavon Austin (JAC),0.0,0.009,5.0,0.0,42.0,0.0,0
198,199.0,Andre Roberts (LAC),2.0,0.011,5.0,0.0,5.0,1.0,0


In [104]:
x = df[['10+ YDS', '% TM', 'CATCHABLE', '20+ YDS', 'YAC', 'RZ TGT']]
y = df[['Top 50']]

In [94]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 5)