In [64]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [85]:
url = "https://cfbinfo.com/team/byu-cougars"
# A timeout keeps a stalled connection from hanging the notebook forever.
session = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page as data.
session.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(session.text, "html.parser")

In [108]:
# Build one row per season from the team page.
# Each pair is (CSS class of the <span> on the page, DataFrame column name);
# keeping them together prevents the two lists from drifting out of step.
season_fields = [
    ('bseasonYear', 'year'),
    ('bseasonRecord', 'record'),
    ('bseasonCoach', 'coach'),
    ('bseasonConference', 'conference'),
]

# find_all is the current name; findAll is a deprecated alias.
rows = soup.find_all('div', class_='bseasonRow')

# Store all the data: the text of each field's <span>, in column order.
all_data = [
    [row.find('span', class_=css_class).text for css_class, _ in season_fields]
    for row in rows
]

df = pd.DataFrame(all_data, columns=[column for _, column in season_fields])
df.head()

Unnamed: 0,year,record,coach,conference
0,1984,13-0 (1.000),LaVell Edwards,WAC
1,1996,14-1 (0.933),LaVell Edwards,WAC
2,1980,12-1 (0.923),LaVell Edwards,WAC
3,1983,11-1 (0.917),LaVell Edwards,WAC
4,1979,11-1 (0.917),LaVell Edwards,WAC


In [109]:
# sort_values accepts a `kind` argument: 'quicksort', 'mergesort' or 'heapsort'.
# No `kind` is passed below, so pandas uses its default algorithm.
# Quicksort: pick a pivot value q, partition the values into those < q and
# those >= q, then sort each part the same way. Worst case is O(N^2).
# Reference: http://staff.ustc.edu.cn/~csli/graduate/algorithms/book6/chap08.htm
#
# Order the seasons by year and renumber the rows from 0.
# `year` was scraped as text, so this is a string comparison.
df = df.sort_values(by='year').reset_index(drop=True)

In [91]:
# Preview the first five rows of the season table.
df.head(n=5)

Unnamed: 0,year,record,coach,conference
0,1922,1-5 (0.167),Alvin G. Twitchell,Rocky Mountain Athletic Conference
1,1923,2-5 (0.286),Alvin G. Twitchell,Rocky Mountain Athletic Conference
2,1924,2-3-1 (0.333),Alvin G. Twitchell,Rocky Mountain Athletic Conference
3,1925,3-3 (0.500),Charles J. Hart,Rocky Mountain Athletic Conference
4,1926,1-5-1 (0.143),Charles J. Hart,Rocky Mountain Athletic Conference


In [145]:
def parse_year(url, year):
    """
    Scrape every game listed on a team's page for a single season.

    Parameters
    ----------
    url : str
        Base team URL; the season page is requested at ``url + '/' + year``.
    year : str or int
        Season to fetch. Stored unchanged in the ``year`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.gameRow`` found on the page, with columns
        ``winloss``, ``date``, ``score``, ``opponent``, ``location`` and ``year``.

    Raises
    ------
    requests.HTTPError
        If the season page responds with an error status.
    """
    # (CSS class of the <div> on the page, output column name), in column order.
    game_fields = [
        ('gameWLT', 'winloss'),
        ('gameDate', 'date'),
        ('gameScore', 'score'),
        ('gameOpponent', 'opponent'),
        ('gameLocation', 'location'),
    ]

    # str() lets an integer year work as well as the scraped string form.
    url_yr = url + '/' + str(year)

    # A timeout keeps one stalled request from hanging the whole scrape.
    response = requests.get(url_yr, timeout=30)
    # Fail loudly rather than silently returning an empty season.
    response.raise_for_status()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    # find_all is the current name; findAll is a deprecated alias.
    all_games = [
        [game.find('div', class_=css_class).text for css_class, _ in game_fields]
        for game in soup.find_all('div', class_='gameRow')
    ]

    df_game = pd.DataFrame(all_games, columns=[column for _, column in game_fields])
    # Keep the caller's value as-is so it still joins against the season table.
    df_game['year'] = year
    return df_game

In [146]:
# Loop through all the years
# Collect each season's frame and concatenate once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it on
# every iteration.
years = df['year'].tolist()
season_frames = [parse_year(url, y) for y in years]

# pd.concat raises on an empty list, so fall back to an empty frame.
# Each season keeps its own 0-based index, as before.
all_years = pd.concat(season_frames) if season_frames else pd.DataFrame()

Unnamed: 0,winloss,date,score,opponent,location,year
0,L,"Sat, October 7",3 - 42,Utah St,"Provo, UT",1922
1,L,"Sat, October 14",0 - 49,Utah,"Salt Lake City, UT",1922
2,L,"Tue, October 24",0 - 47,Colorado School of Mines,"Provo, UT",1922
3,W,"Tue, November 14",7 - 0,Wyoming,"Provo, UT",1922
4,L,"Sat, November 25",0 - 33,Colorado St,"Fort Collins, CO",1922
5,L,"Thu, November 30",0 - 13,Wyoming,"Laramie, WY",1922
0,W,"Sat, September 29",16 - 15,Montana St,"Provo, UT",1923
1,L,"Sun, October 7",0 - 41,Colorado,"Boulder, CO",1923
2,L,"Sat, October 13",6 - 14,Colorado St,"Fort Collins, CO",1923
3,L,"Sat, October 27",0 - 15,Utah,"Provo, UT",1923


In [158]:
# Inspect the last 20 scraped games.
all_years.tail(n=20)

Unnamed: 0,winloss,date,score,opponent,location,year
5,L,"Fri, October 6",7 - 24,Boise St,"LaVell Edwards Stadium - Provo, UT",2017
6,L,"Sat, October 14",10 - 35,Mississippi St,"Davis Wade Stadium - Starkville, MS",2017
7,L,"Sat, October 21",17 - 33,East Carolina,"Dowdy-Ficklen Stadium - Greenville, NC",2017
8,W,"Sat, October 28",41 - 20,San Jose St,"LaVell Edwards Stadium - Provo, UT",2017
9,L,"Sat, November 4",13 - 20,Fresno St,"Bulldog Stadium (CA) - Fresno, CA",2017
10,W,"Fri, November 10",31 - 21,UNLV,"Sam Boyd Stadium - Las Vegas, NV",2017
11,L,"Sat, November 18",10 - 16,Massachusetts,"LaVell Edwards Stadium - Provo, UT",2017
12,W,"Sat, November 25",30 - 20,Hawai'i,"Aloha Stadium - Honolulu, HI",2017
0,W,"Sat, September 1",28 - 23,Arizona,"Arizona Stadium - Tucson, AZ",2018
1,,"Sat, September 8",0 - 0,California,"LaVell Edwards Stadium - Provo, UT",2018


In [154]:
# Attach each season's record, coach and conference to its games by joining
# the per-game rows to the season table on the shared `year` column.
byu_data = pd.merge(all_years, df, on='year')

In [156]:
# Write the joined table to disk without the row index.
byu_data.to_csv('byu_seasons.csv', index=False)