In [2]:
# Import Scraping Modules 

from urllib.request import urlopen
from bs4 import BeautifulSoup

#Import data manipulation and viz packages 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib as mpl
import statistics 

In [3]:
#url page for NFL Passing Leaders for 2020-2021 Season 

url = 'https://www.pro-football-reference.com/years/2020/rushing.htm'

#Use the scraping module to open url and pass to Beautiful Soup

html = urlopen(url)

stats_sheet = BeautifulSoup(html)

In [8]:
#Fetch column header - 1st row; since the chart has two levels: Games & Rushing as 1st row, we will select index 1 for column headers

column_headers = stats_sheet.findAll('tr')[1]

column_headers = [i.getText() for i in column_headers.findAll('th')]

print(column_headers)

table_row = stats_sheet.findAll('tr')[2:]

rush_stats = []

for i in range(len(table_row)):
    rush_stats.append([col.getText() for col in table_row[i].findAll('td')])
    
print(rush_stats[0])    

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'Att', 'Yds', 'TD', '1D', 'Lng', 'Y/A', 'Y/G', 'Fmb']
['Derrick Henry *+', 'TEN', '26', 'RB', '16', '16', '378', '2027', '17', '98', '94', '5.4', '126.7', '3']


In [20]:
#We will have our column heads and stats_sheet. Let's create the dataframe

rush_df = pd.DataFrame(rush_stats, columns=column_headers[1:])

#View the head of our new dataframe
rush_df.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb
0,Derrick Henry *+,TEN,26,RB,16,16,378,2027,17,98,94,5.4,126.7,3
1,Dalvin Cook*,MIN,25,RB,14,14,312,1557,16,91,70,5.0,111.2,5
2,Josh Jacobs*,LVR,22,RB,15,15,273,1065,12,61,28,3.9,71.0,2
3,David Montgomery,CHI,23,RB,15,14,247,1070,8,59,80,4.3,71.3,1
4,Ezekiel Elliott,DAL,25,RB,15,15,244,979,6,62,31,4.0,65.3,6


Games
G -- Games played
<br>
GS -- Games started as an offensive or defensive player
<br>
Rushing
<br>
Att -- Rushing Attempts (sacks not included in NFL)
<br>
Yds -- Rushing Yards Gained (sack yardage is not included by NFL)
<br>
TD -- Rushing Touchdowns
<br>
1D -- First downs rushing
<br>
Lng -- Longest Rushing Attempt
<br>
Y/A -- Rushing Yards per Attempt
Minimum 6.25 rushes per game scheduled to qualify as leader.
Minimum 750 rushes to qualify as career leader.
<br>
Y/G -- Rushing Yards per Game
(minimum half a game per game scheduled to qualify as leader)
(Rushing Yards)/(Games Played)
<br>
Fumbles
Fmb -- Number of times fumbled both lost and recovered by own team


In [22]:
#Select categories for viz and analysis 

# Att, Yds, TD, 1D, Y/A, Fmb

categories = ['Att', 'Yds', 'TD', '1D', 'Y/A', 'Fmb']

final_df = rush_df[['Player', 'Tm'] + categories]

final_df.head()

Unnamed: 0,Player,Tm,Att,Yds,TD,1D,Y/A,Fmb
0,Derrick Henry *+,TEN,378,2027,17,98,5.4,3
1,Dalvin Cook*,MIN,312,1557,16,91,5.0,5
2,Josh Jacobs*,LVR,273,1065,12,61,3.9,2
3,David Montgomery,CHI,247,1070,8,59,4.3,1
4,Ezekiel Elliott,DAL,244,979,6,62,4.0,6


In [24]:
#Check the data types of final subset used for our analysis 
final_df.dtypes

Player    object
Tm        object
Att       object
Yds       object
TD        object
1D        object
Y/A       object
Fmb       object
dtype: object

In [29]:
#We must convert the categories to numerical data since we cannot analyze objects for this project

for i in categories:
    final_df[i] = pd.to_numeric(final_df[i])
    
final_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[i] = pd.to_numeric(final_df[i])


Player     object
Tm         object
Att       float64
Yds       float64
TD        float64
1D        float64
Y/A       float64
Fmb       float64
dtype: object

In [31]:
final_df['Player'] = final_df['Player'].str.replace('*','')

final_df['Player'] = final_df['Player'].str.replace('+', '')

final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Player'] = final_df['Player'].str.replace('*','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Player'] = final_df['Player'].str.replace('+', '')


Unnamed: 0,Player,Tm,Att,Yds,TD,1D,Y/A,Fmb
0,Derrick Henry,TEN,378.0,2027.0,17.0,98.0,5.4,3.0
1,Dalvin Cook,MIN,312.0,1557.0,16.0,91.0,5.0,5.0
2,Josh Jacobs,LVR,273.0,1065.0,12.0,61.0,3.9,2.0
3,David Montgomery,CHI,247.0,1070.0,8.0,59.0,4.3,1.0
4,Ezekiel Elliott,DAL,244.0,979.0,6.0,62.0,4.0,6.0
