# CSCI - Introduction to Programming with Python #


## The TLDR version (see below for step-by-step annotated tutorial) ##

In [2]:
from bs4 import BeautifulSoup as bs4
import requests
import csv

alphabet = "abcdefghijklmnopqrstuvwyz"
player_list = []


for letter in alphabet:
    MLB_players = requests.get("http://www.baseball-reference.com/players/{}/".format(letter))
    MLB_soup = bs4(MLB_players.content, "html.parser")
   
    players = MLB_soup.findAll("p")
    
    for player in players:
        if "(" in player.text:
            player_list.append(player.text)
            
new_player_list = []
for player in player_list:
    player = player.replace('(',"").replace(')',"").replace("-"," ").replace("  ", " ")
    new_player_list.append(player)

split_player_list = []
for player in new_player_list:
    split_player_list.append(player.split(" "))

for career in split_player_list:
    if len(career) == 5:
        career[0] = career[0] + career[1]
        career.remove(career[1])
    
    elif len(career) == 3:
        career.insert(0, career[0])
        
    elif len(career) == 6:
        career[1] = career[1] + career[2] + career[3]
        career.remove(career[2])
        career.remove(career[2])
        
for row in split_player_list:
    row.append((int(row[3]) - int(row[2]))+1)
        
with open("MLB_careers.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerows(split_player_list)

## Annotated Step by Step version

In [2]:
#import these libraries first
from bs4 import BeautifulSoup as bs4
import requests

#request.get grabs html from the url
MLB_players = requests.get("http://www.baseball-reference.com/players/a/")

#this creates a beautifulsoup object
MLB_soup = bs4(MLB_players.content, "html.parser")


### As you can see, the contents of a request.get call is a mess, so we need beautifulsoup to parse the html.

In [4]:
print (MLB_players.content[0:200])

b'<!DOCTYPE html>\n<html data-version="klecko-" data-root="/home/br/build" itemscope itemtype="http://schema.org/WebSite" lang="en" class="no-js" >\n<head>\n    <meta name="viewport" content="width=device-'


In [7]:
#this makes the html much easier to readh
print (MLB_soup.prettify())

<!DOCTYPE html>
<html class="no-js" data-root="/home/br/build" data-version="klecko-" itemscope="" itemtype="http://schema.org/WebSite" lang="en">
 <head>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
  <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/201704241" rel="dns-prefetch"/>
  <link crossorigin="" href="https://d2p3bygnnzw9w3.cloudfront.net" rel="preconnect">
   <link crossorigin="" href="http://d9kjk42l7bfqz.cloudfront.net/req/201607120" rel="preconnect">
    <link crossorigin="" href="https://www.google-analytics.com/" rel="preconnect">
     <link crossorigin="" href="https://www.googletagservices.com" rel="preconnect">
      <script class="allowed">
       var sr_is_production = true;
function vjs_ready(e){"loading"!=document.readyState?e():document.addEventListener("DOMContentLoaded",e)}var log_performance=!1,sr_detect_operaMini=navigator.userAgent.indexOf("Opera Mini")>-1;if(sr_detect_operaMini){var el=document.querySelec

### Next, we use FindAll() to get the content under the correct tag. In this case, the "p" tag.

In [12]:
players = MLB_soup.findAll("p")

#returns the strings inside each of the elements with a "p" tag
for player in players:
    print (player.text)


Professional Baseball Register
Other Personnel
Managers Directory
Bullpen Wiki with bios of many players, managers from MLB and beyond.
Best Known Players
Professional Baseball Register
Other Personnel
Managers Directory
Bullpen Wiki with bios of many players, managers from MLB and beyond.
Best Known Players
David Aardsma  (2004-2015)
Hank Aaron+ (1954-1976)
Tommie Aaron  (1962-1971)
Don Aase  (1977-1990)
Andy Abad  (2001-2006)
Fernando Abad  (2010-2017)
John Abadie  (1875-1875)
Ed Abbaticchio  (1897-1910)
Bert Abbey  (1892-1896)
Charlie Abbey  (1893-1897)
Dan Abbott  (1890-1890)
Fred Abbott  (1903-1905)
Glenn Abbott  (1973-1984)
Jeff Abbott  (1997-2001)
Jim Abbott  (1989-1999)
Kurt Abbott  (1993-2001)
Kyle Abbott  (1991-1996)
Ody Abbott  (1910-1910)
Paul Abbott  (1990-2004)
Al Aber  (1950-1957)
Frank Abercrombie  (1871-1871)
Reggie Abercrombie  (2006-2008)
Bill Abernathie  (1952-1952)
Brent Abernathy  (2001-2005)
Ted Abernathy  (1942-1944)
Ted Abernathy  (1955-1972)
Woody Abernathy  (

### However, this return more text than just the player names, so we need an additional for loop
### to separate the player names. 

In [14]:
player_list = []

#appends each string with a parenthesis to a new list,
for player in players:
    if "(" in player.text:
        player_list.append(player.text)

player_list[:10]

['David Aardsma  (2004-2015)',
 'Hank Aaron+ (1954-1976)',
 'Tommie Aaron  (1962-1971)',
 'Don Aase  (1977-1990)',
 'Andy Abad  (2001-2006)',
 'Fernando Abad  (2010-2017)',
 'John Abadie  (1875-1875)',
 'Ed Abbaticchio  (1897-1910)',
 'Bert Abbey  (1892-1896)',
 'Charlie Abbey  (1893-1897)']

### Now that we've parsed the html to grab only what we need, we need to clean the data to make it easier to analyze.

In [15]:
#removes characters from the strings
new_player_list = []
for player in player_list:
    player = player.replace('(',"").replace(')',"").replace("-"," ").replace("  ", " ")
    new_player_list.append(player)

new_player_list

['David Aardsma 2004 2015',
 'Hank Aaron+ 1954 1976',
 'Tommie Aaron 1962 1971',
 'Don Aase 1977 1990',
 'Andy Abad 2001 2006',
 'Fernando Abad 2010 2017',
 'John Abadie 1875 1875',
 'Ed Abbaticchio 1897 1910',
 'Bert Abbey 1892 1896',
 'Charlie Abbey 1893 1897',
 'Dan Abbott 1890 1890',
 'Fred Abbott 1903 1905',
 'Glenn Abbott 1973 1984',
 'Jeff Abbott 1997 2001',
 'Jim Abbott 1989 1999',
 'Kurt Abbott 1993 2001',
 'Kyle Abbott 1991 1996',
 'Ody Abbott 1910 1910',
 'Paul Abbott 1990 2004',
 'Al Aber 1950 1957',
 'Frank Abercrombie 1871 1871',
 'Reggie Abercrombie 2006 2008',
 'Bill Abernathie 1952 1952',
 'Brent Abernathy 2001 2005',
 'Ted Abernathy 1942 1944',
 'Ted Abernathy 1955 1972',
 'Woody Abernathy 1946 1947',
 'Cliff Aberson 1947 1949',
 'Harry Ables 1905 1911',
 'Shawn Abner 1987 1992',
 'Cal Abrams 1949 1956',
 'George Abrams 1923 1923',
 'Johnny Abrego 1985 1985',
 'Bobby Abreu 1996 2014',
 'Joe Abreu 1942 1942',
 'Jose Abreu 2014 2017',
 'Juan Abreu 2011 2011',
 'Tony Abr

In [16]:
split_player_list = []
for player in new_player_list:
    split_player_list.append(player.split(" "))
    
split_player_list

[['David', 'Aardsma', '2004', '2015'],
 ['Hank', 'Aaron+', '1954', '1976'],
 ['Tommie', 'Aaron', '1962', '1971'],
 ['Don', 'Aase', '1977', '1990'],
 ['Andy', 'Abad', '2001', '2006'],
 ['Fernando', 'Abad', '2010', '2017'],
 ['John', 'Abadie', '1875', '1875'],
 ['Ed', 'Abbaticchio', '1897', '1910'],
 ['Bert', 'Abbey', '1892', '1896'],
 ['Charlie', 'Abbey', '1893', '1897'],
 ['Dan', 'Abbott', '1890', '1890'],
 ['Fred', 'Abbott', '1903', '1905'],
 ['Glenn', 'Abbott', '1973', '1984'],
 ['Jeff', 'Abbott', '1997', '2001'],
 ['Jim', 'Abbott', '1989', '1999'],
 ['Kurt', 'Abbott', '1993', '2001'],
 ['Kyle', 'Abbott', '1991', '1996'],
 ['Ody', 'Abbott', '1910', '1910'],
 ['Paul', 'Abbott', '1990', '2004'],
 ['Al', 'Aber', '1950', '1957'],
 ['Frank', 'Abercrombie', '1871', '1871'],
 ['Reggie', 'Abercrombie', '2006', '2008'],
 ['Bill', 'Abernathie', '1952', '1952'],
 ['Brent', 'Abernathy', '2001', '2005'],
 ['Ted', 'Abernathy', '1942', '1944'],
 ['Ted', 'Abernathy', '1955', '1972'],
 ['Woody', 'Abe

### Unfortunately, the length of each list is not the same because not every single player in the full list has exactly 1 first name and 1 last name.

In [None]:
#takes care of people with middle names by adding first and middle name to one string
for career in split_player_list:
    if len(career) == 5:
        career[0] = career[0] + career[1]
        career.remove(career[1])

#makes player with one name have his first and last name the same
    elif len(career) == 3:
        career.insert(0, career[0])

#combines the strings of players with more than one last name into one string. (Eg. de la cruz --->delacruz)        
    elif len(career) == 6:
        career[1] = career[1] + career[2] + career[3]
        career.remove(career[2])
        career.remove(career[2])

### With that, we have a list of lists with all players with a last name that starts with "A". 

### And now comes the real magic of webscraping. With a few additional lines of code, we can grab the html from each of the pages and add the relevant data to our data set in a matter of seconds. 

In [1]:
alphabet = "abcdefghijklmnopqrstuvwyz"
player_list = []

#this for loop and nested for loop creates a beautiful soup object for each url (a - z) and appends
#the relevant text to an empty list
for letter in alphabet:
    MLB_players = requests.get("http://www.baseball-reference.com/players/{}/".format(letter))
    MLB_soup = bs4(MLB_players.content, "html.parser")
   
    players = MLB_soup.findAll("p")
    
    for player in players:
        if "(" in player.text:
            player_list.append(player.text)

NameError: name 'requests' is not defined

### Run the same code from before to clean the data.

In [None]:
new_player_list = []
for player in player_list:
    player = player.replace('(',"").replace(')',"").replace("-"," ").replace("  ", " ")
    new_player_list.append(player)

split_player_list = []
for player in new_player_list:
    split_player_list.append(player.split(" "))

for career in split_player_list:
    if len(career) == 5:
        career[0] = career[0] + career[1]
        career.remove(career[1])
    
    elif len(career) == 3:
        career.insert(0, career[0])
        
    elif len(career) == 6:
        career[1] = career[1] + career[2] + career[3]
        career.remove(career[2])
        career.remove(career[2])

### Add a column to the end of each row that is the length of the player's career.

In [None]:
#add one to the value of the difference between the starting year and end year.
#otherwise, players who played 1 season will have 0 as the length of their career. 

for row in split_player_list:
    row.append((int(row[3]) - int(row[2]))+1)

### Finally, write the data to a csv file. 

In [None]:
import csv

with open("MLB_careers.csv","w",newline="") as f:
    writer = csv.writer(f)
    writer.writerows(split_player_list)

# Data Analysis

### Rookies

In [3]:
for career in split_player_list:
    if "2017" in career[2]:
        print (career)

['Christian', 'Arroyo', '2017', '2017', 1]
['Barrett', 'Astin', '2017', '2017', 1]
['Rafael', 'Bautista', '2017', '2017', 1]
['Cody', 'Bellinger', '2017', '2017', 1]
['Jorge', 'Bonifacio', '2017', '2017', 1]
['John', 'Bormann', '2017', '2017', 1]
['Johan', 'Camargo', '2017', '2017', 1]
['Shane', 'Carle', '2017', '2017', 1]
['Allen', 'Cordoba', '2017', '2017', 1]
['Dylan', 'Covey', '2017', '2017', 1]
['Stefan', 'Crichton', '2017', '2017', 1]
['Rookie', 'Davis', '2017', '2017', 1]
['ChaseDe', 'Jong', '2017', '2017', 1]
['Miguel', 'Diaz', '2017', '2017', 1]
['Yandy', 'Diaz', '2017', '2017', 1]
['Phil', 'Ervin', '2017', '2017', 1]
['Kyle', 'Freeland', '2017', '2017', 1]
['Jarlin', 'Garcia', '2017', '2017', 1]
['Willy', 'Garcia', '2017', '2017', 1]
['Amir', 'Garrett', '2017', '2017', 1]
['Justin', 'Haley', '2017', '2017', 1]
['Mike', 'Hauschild', '2017', '2017', 1]
['Ariel', 'Hernandez', '2017', '2017', 1]
['Kyle', 'Higashioka', '2017', '2017', 1]
['ChihWei', 'Hu', '2017', '2017', 1]
['Joe'

### Current players

In [4]:
for career in split_player_list:
    if "2017" in career[3]:
        print (career)

['Fernando', 'Abad', '2010', '2017', 8]
['Jose', 'Abreu', '2014', '2017', 4]
['Cristhian', 'Adames', '2014', '2017', 4]
['Lane', 'Adams', '2014', '2017', 4]
['Matt', 'Adams', '2012', '2017', 6]
['Jim', 'Adduci', '2013', '2017', 5]
['Tim', 'Adleman', '2016', '2017', 2]
['Jesus', 'Aguilar', '2014', '2017', 4]
['Nick', 'Ahmed', '2014', '2017', 4]
['Matt', 'Albers', '2006', '2017', 12]
['Arismendy', 'Alcantara', '2014', '2017', 4]
['Raul', 'Alcantara', '2016', '2017', 2]
['Scott', 'Alexander', '2015', '2017', 3]
['Cody', 'Allen', '2012', '2017', 6]
['Abraham', 'Almonte', '2013', '2017', 5]
['Albert', 'Almora', '2016', '2017', 2]
['Yonder', 'Alonso', '2010', '2017', 8]
['Dan', 'Altavilla', '2016', '2017', 2]
['Aaron', 'Altherr', '2014', '2017', 4]
['Jose', 'Altuve', '2011', '2017', 7]
['Dario', 'Alvarez', '2014', '2017', 4]
['Jose', 'Alvarez', '2013', '2017', 5]
['Alexi', 'Amarista', '2011', '2017', 7]
['Brett', 'Anderson', '2009', '2017', 9]
['Chase', 'Anderson', '2014', '2017', 4]
['Tim',

### Hall of Famers

In [5]:
#the original html had a plus for every HOF player, so I just left it in. 
for career in split_player_list:
    if "+" in career[1]:
        print (career)

['Hank', 'Aaron+', '1954', '1976', 23]
['Pete', 'Alexander+', '1911', '1930', 20]
['Roberto', 'Alomar+', '1988', '2004', 17]
['Walter', 'Alston+', '1936', '1936', 1]
['Sparky', 'Anderson+', '1959', '1959', 1]
['Cap', 'Anson+', '1871', '1897', 27]
['Luis', 'Aparicio+', '1956', '1973', 18]
['Luke', 'Appling+', '1930', '1950', 21]
['Richie', 'Ashburn+', '1948', '1962', 15]
['Earl', 'Averill+', '1929', '1941', 13]
['Jeff', 'Bagwell+', '1991', '2005', 15]
['HomeRun', 'Baker+', '1908', '1922', 15]
['Dave', 'Bancroft+', '1915', '1930', 16]
['Ernie', 'Banks+', '1953', '1971', 19]
['Jake', 'Beckley+', '1888', '1907', 20]
['Johnny', 'Bench+', '1967', '1983', 17]
['Chief', 'Bender+', '1903', '1925', 23]
['Yogi', 'Berra+', '1946', '1965', 20]
['Craig', 'Biggio+', '1988', '2007', 20]
['Bert', 'Blyleven+', '1970', '1992', 23]
['Wade', 'Boggs+', '1982', '1999', 18]
['Jim', 'Bottomley+', '1922', '1937', 16]
['Lou', 'Boudreau+', '1938', '1952', 15]
['Roger', 'Bresnahan+', '1897', '1915', 19]
['George',

### Players with long careers

In [8]:
long_careers = []
for career in split_player_list:
    if career[4] > 20:
        long_careers.append(career)
        print (career)
print ("number of players:", len(long_careers))

['Hank', 'Aaron+', '1954', '1976', 23]
['Babe', 'Adams', '1906', '1926', 21]
['Nick', 'Altrock', '1898', '1933', 36]
['Cap', 'Anson+', '1871', '1897', 27]
['Luke', 'Appling+', '1930', '1950', 21]
['Jimmy', 'Austin', '1909', '1929', 21]
['Harold', 'Baines', '1980', '2001', 22]
['Miguel', 'Batista', '1992', '2012', 21]
['BoomBoom', 'Beck', '1924', '1945', 22]
['Chief', 'Bender+', '1903', '1925', 23]
['Bert', 'Blyleven+', '1970', '1992', 23]
['Barry', 'Bonds', '1986', '2007', 22]
['George', 'Brett+', '1973', '1993', 21]
['Dan', 'Brouthers+', '1879', '1904', 26]
['Bill', 'Buckner', '1969', '1990', 22]
['Guy', 'Bush', '1923', '1945', 23]
['Earl', 'Caldwell', '1928', '1948', 21]
['Fred', 'Carisch', '1903', '1923', 21]
['Steve', 'Carlton+', '1965', '1988', 24]
['Phil', 'Cavarretta', '1934', '1955', 22]
['Fred', 'Clarke+', '1894', '1915', 22]
['Roger', 'Clemens', '1984', '2007', 24]
['Ty', 'Cobb+', '1905', '1928', 24]
['Eddie', 'Collins+', '1906', '1930', 25]
['Bartolo', 'Colon', '1997', '2017

### Average length of career

In [12]:
import numpy as np

len_career = []
for row in split_player_list:
#     if int(row[2]) > 2000:
    len_career.append(row[4])
    
print (np.mean(len_career))

5.74736398144
