# Web scraping using Python and BeautifulSoup

NCAA: 
https://web3.ncaa.org/aprsearch/gsrsearch


#### Collecting web page data

In [1]:
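# Install the third-party packages used below (run these in a shell, not in Python)
# $ pip install requests        # only needed to download the page; this notebook parses a saved copy
# $ pip install beautifulsoup4 pandas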

In [2]:
# Import the installed modules
from bs4 import BeautifulSoup
from urllib import request 

In [3]:
import pandas as pd
# Read the saved HTML of the NCAA page into a string for parsing with BeautifulSoup.
# The with-block closes the file handle as soon as the read finishes.
with open("ncaa.txt", "r") as f:
    ncaa = f.read()

In [4]:
# Parse the HTML text into a BeautifulSoup tree using Python's built-in html.parser
# (the markup argument may be a str or bytes object).
soup = BeautifulSoup(ncaa, features='html.parser')
# prettify() returns the parsed document as an indented string, similar to what
# a browser's "inspect" view shows; print only the first 1000 characters.
print(soup.prettify()[0:1000])

<html class="js flexbox canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache svg inlinesvg smil svgclippaths" style="">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="8fe4688f-250d-41cd-8944-45837b00efc4" name="_csrf"/>
  <meta content="X-CSRF-TOKEN" name="_csrf_header"/>
  <title>
   Graduation Success Rate
  </title>
  <link href="/aprsearch/css/bootstrap.min.css" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Roboto:400,900,700,500" rel="stylesheet" type="text/css"/>
  <link href="https://fonts.googleapis.com/css?family=Raleway:400,900,700,500" rel="stylesheet" type="text/css

In [5]:
# Locate the search-results table by its class attribute.
# find() returns only the first matching element (or None when nothing matches),
# not every table on the page.
table = soup.find('table', {'class':'table table-condensed table-responsive table-hover dataTable no-footer'})

In [6]:
# `table` was returned by find(), so it is a single bs4.element.Tag.
# find_all() would instead return a bs4.element.ResultSet (a list of Tags) for multiple tables.
type(table)

bs4.element.Tag

In [7]:
# Print the table's HTML to see how its header (<th>) and data (<td>) cells are marked up
print(table)

<table aria-describedby="searchResultsTable_info" class="table table-condensed table-responsive table-hover dataTable no-footer" id="searchResultsTable" role="grid" style="width: 100%;" width="100%">
<thead>
<tr class="titleRow" role="row"><th aria-controls="searchResultsTable" aria-label="Cohort Year: activate to sort column descending" aria-sort="ascending" class="sorting_asc" colspan="1" rowspan="1" style="width: 56px;" tabindex="0">Cohort Year</th><th aria-controls="searchResultsTable" aria-label="School: activate to sort column ascending" class="sorting" colspan="1" rowspan="1" style="width: 275px;" tabindex="0">School</th><th aria-controls="searchResultsTable" aria-label="Conference: activate to sort column ascending" class="sorting" colspan="1" rowspan="1" style="width: 161px;" tabindex="0">Conference</th><th aria-controls="searchResultsTable" aria-label="Sport: activate to sort column ascending" class="sorting" colspan="1" rowspan="1" style="width: 38px;" tabindex="0">Sport</th

In [8]:
# Use the text of the first 7 <th> cells as the column headings.
# Slicing the find_all() result replaces the manual counter/break loop, and
# strip() cleans the header text the same way the data cells are cleaned.
headings = [th.text.strip() for th in table.find_all('th')[:7]]

In [9]:
# Display the extracted column headings
headings

['Cohort Year', 'School', 'Conference', 'Sport', 'State', 'GSR', 'FGR']

In [10]:
#### Checking ####
# Preview the data before storing it: print the first 7 cell values of every data row
for row in table.find_all('tr'):      # <tr> defines a table row
    cells = row.find_all('td')        # <td> defines a table cell
    # rows with no <td> cells (such as the <th> header row) give an empty result and are skipped
    if cells:
        # stripped text of the first 7 cells, printed as a list
        print([cell.text.strip() for cell in cells[:7]])

['2006', 'University of Akron', 'Mid-American Conference', 'Football', 'OH', '58', '52']
['2006', 'Alabama A&M University', 'Southwestern Athletic Conf.', 'Football', 'AL', '40', '45']
['2006', 'Alabama State University', 'Southwestern Athletic Conf.', 'Football', 'AL', '81', '59']
['2006', 'University of Alabama', 'Southeastern Conference', 'Football', 'AL', '73', '57']
['2006', 'University of Alabama at Birmingham', 'Conference USA', 'Football', 'AL', '60', '55']
['2006', 'University at Albany', 'Northeast Conference', 'Football', 'NY', '74', '57']
['2006', 'Alcorn State University', 'Southwestern Athletic Conf.', 'Football', 'MS', '57', '39']
['2006', 'Appalachian State University', 'Southern Conference', 'Football', 'NC', '68', '56']
['2006', 'Arizona State University', 'Pac-12 Conference', 'Football', 'AZ', '67', '52']
['2006', 'University of Arizona', 'Pac-12 Conference', 'Football', 'AZ', '61', '59']
['2006', 'Arkansas State University', 'Sun Belt Conference', 'Football', 'AR', 

In [11]:
# Collect the first 7 cell values of every data row into a list of lists
rows = []
for row in table.find_all('tr'):      # <tr> defines a table row
    cells = row.find_all('td')        # <td> defines a table cell
    # rows with no <td> cells (such as the <th> header row) give an empty result and are skipped
    if cells:
        # one inner list of stripped cell text per table row
        rows.append([cell.text.strip() for cell in cells[:7]])

In [12]:
import pandas as pd
# Build a DataFrame with one row per scraped table row and the extracted headings as column names
df = pd.DataFrame(data=rows, columns=headings)

In [13]:
# Display the resulting DataFrame
df

Unnamed: 0,Cohort Year,School,Conference,Sport,State,GSR,FGR
0,2006,University of Akron,Mid-American Conference,Football,OH,58,52
1,2006,Alabama A&M University,Southwestern Athletic Conf.,Football,AL,40,45
2,2006,Alabama State University,Southwestern Athletic Conf.,Football,AL,81,59
3,2006,University of Alabama,Southeastern Conference,Football,AL,73,57
4,2006,University of Alabama at Birmingham,Conference USA,Football,AL,60,55
...,...,...,...,...,...,...,...
237,2006,Elon University,Southern Conference,Football,NC,80,69
238,2006,Gardner-Webb University,Big South Conference,Football,NC,79,65
239,2006,Presbyterian College,Big South Conference,Football,SC,83,65
240,2006,"University of Arkansas, Pine Bluff",Southwestern Athletic Conf.,Football,AR,47,37


In [14]:
# Write the DataFrame to a UTF-8 encoded CSV file, leaving out the index column
df.to_csv('ncaa_2006_cohort.csv', index=False, encoding='utf-8')