In [3]:
import requests
from bs4 import BeautifulSoup

#Step1: create a variable called headers to tell the website that we are a browser and not a scraping tool
headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

#Global Variables
url = "https://www.transfermarkt.us/premier-league/startseite/wettbewerb/GB1/plus/?saison_id="

def collect_data(year):
    #Step 2: assigns the address of the page we need to scrape to a string
    link = url + str(year)

    try:
        # Step 3: uses the requests library to grab the code of a page and assign it to 'PageTree'
        pageTree = requests.get(link, headers=headers)

        # Step 4: parses the website code into html and we will be able to search through this for the data we want to extract
        pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

        # Step 5: Extracting team names
        team_names_raw = pageSoup.find_all("td", {"class": "hauptlink no-border-links"})
        team_names = []
        for i in range(20):
            if(team_names_raw[i].text[len(team_names_raw[i].text)-1:len(team_names_raw[i].text)] == '\xa0'):
                team_names.append((team_names_raw[i].text[:len(team_names_raw[i].text)-2]).lower())
            else:
                team_names.append((team_names_raw[i].text[0:len(team_names_raw[i].text) - 1]).lower())

        #Step 6: Extracting Squad Size, Average Age, and Number of Foreigners
        #they are all stored in the "zentriert" class in the website
        squad_age_foreigners_raw = pageSoup.find_all("td", {"class": "zentriert"})
        squad_size = []
        average_age = []
        num_foreigners = []
        i = 4
        while(len(squad_size) < 20):
            squad_size.append(int(squad_age_foreigners_raw[i].text))
            i += 1
            average_age.append(float(squad_age_foreigners_raw[i].text))
            i += 1
            num_foreigners.append(int(squad_age_foreigners_raw[i].text))
            i += 2

        squad_age_foreigners_raw = squad_age_foreigners_raw[i:]

        #Step 7: Extracting Average Market Value and Total Market Value
        market_values_raw = pageSoup.find_all("td", {"class": "rechts"})
        average_market_value = [] #in millions
        total_market_value = [] #in millions
        i = 2
        while(len(average_market_value) < 20):
            #getting the average market value
            amv = market_values_raw[i].text
            if(amv[len(amv)-1:] == "m"):
                amv = float(amv[1:len(amv)-1])
            elif(amv[len(amv)-1:] == "k"):
                amv = float(amv[1:len(amv)-1]) * 0.001
            average_market_value.append(amv)

            #iterating
            i += 1

            #getting the total market value
            tmv = market_values_raw[i].text
            if(tmv[len(tmv) - 1:] == "n"):
                tmv = float(tmv[1:len(tmv) - 2]) * 1000
            elif(tmv[len(tmv) - 1:] == "m"):
                tmv = float(tmv[1:len(tmv) - 1])
            total_market_value.append(tmv)

            #iterating
            i += 1

        #Step 8: Extracting Position
        positions_raw = pageSoup.find_all("td", {"class": "rechts hauptlink"})
        positions = []
        for i in range(len(positions_raw)):
                positions.append(int(positions_raw[i].text))

        ranking_table_raw = pageSoup.find_all("td", {"class": "no-border-links hauptlink"})
        ranking_table = []
        for i in range(len(ranking_table_raw)):
            if(ranking_table_raw[i].text[len(ranking_table_raw[i].text)-2:len(ranking_table_raw[i].text)-1] == '\xa0'):
                ranking_table.append((ranking_table_raw[i].text[1:len(ranking_table_raw[i].text) - 2]).lower())
            else:
                ranking_table.append((ranking_table_raw[i].text[1:len(ranking_table_raw[i].text)-1]).lower())


        #Step 9: Get Goal Difference and Points
            #the rest of the league table is in the "zentriert" class
            #which was stored previously in squad_age_foreigners_raw
        goal_difference = []
        points = []
        i = len(squad_age_foreigners_raw) - 1

        while(len(goal_difference) < 20 and i > 5):
            points.insert(0, int(squad_age_foreigners_raw[i].text))
            i -= 1

            goal_difference.insert(0, int(squad_age_foreigners_raw[i].text))
            i -= 3

        #Step 10: Some Data Cleansing
        #mapping short names to full names
        #team names is by value - long form
        #ranking table is by rank - short form

        final_goal_difference = []
        final_positions = []
        final_points = []
        for i in range(20):
            final_goal_difference.append(0)
            final_positions.append(0)
            final_points.append(0)

        #run through number 1, most exhaustive link
        for i in range(len(ranking_table)):
            all_names = ranking_table[i].split()
            for j in range(len(team_names)):
                match = True
                for k in all_names:
                    if(not(k in team_names[j])):
                        match = False
                if(match):
                    #add to ordered lists
                    final_goal_difference[j] = goal_difference[i]
                    final_positions[j] = positions[i]
                    final_points[j] = points[i]
                    ranking_table[i] = ""

        #run through number two, test first three letters
        for i in range(len(ranking_table)):
            if(ranking_table[i] != ""):
                all_names = ranking_table[i][0:3]
            else:
                continue
            for j in range(len(team_names)):
                if(all_names in team_names[j] and final_positions[j] == 0):
                    final_goal_difference[j] = goal_difference[i]
                    final_positions[j] = positions[i]
                    final_points[j] = points[i]
                    ranking_table[i] = ""

        #Step 11: Creating a year list
        year_list = []
        for i in range(20):
            year_list.append(year)

        #All Data is Collected
        #test
        print("END OF DATA COLLECTION-------------------------------------------")
        print("Year: ", year_list, len(year_list))
        print("Team Names: ", team_names, len(team_names))
        print("Position: ", final_positions, len(final_positions))
        print("Goal Difference: ", goal_difference, len(goal_difference))
        print("Points: ", points, len(points))
        print("Squad Size: ", squad_size, len(squad_size))
        print("Average Age: ", average_age, len(average_age))
        print("Num Foreigners: ", num_foreigners, len(num_foreigners))
        print("Average Market Value: ", average_market_value, len(average_market_value))
        print("Total Market Value: ", total_market_value, len(total_market_value))
        print("-----------------------------------------------------------------")
        #Year,Team,Position,Goal Difference,Points,Squad Size,Average Age,Number of Foreigners,Average Market Value, Total Market Value
        #Step 12: write to a csv for data from the 2004/05 season to the 2021/22 season which is 18 years of data!
        textfile = open("BigPositionVsValue.csv", "a")
        for i in range(20):
            textfile.write(str(year_list[i]) + "," + str(team_names[i]) + "," + str(final_positions[i]) + "," + str(goal_difference[i]) + "," + str(points[i]) + "," + str(squad_size[i]) + "," + str(average_age[i]) + "," + str(num_foreigners[i]) + "," + str(average_market_value[i]) + "," + str(total_market_value[i]) + "\n")
        textfile.close()
    except:
        print("Improper URL entered!")

#Main
print("This program will populate BigPositionVsValue.csv with data from season 2017/18 to season 2022/23.")
command = input("Enter anything to start:")
if(command != ""):
    year = 2023
    while(year < 2024):
        collect_data(year)
        year += 1

This program will populate BigPositionVsValue.csv with data from season 2017/18 to season 2022/23.
Enter anything to start:hi
END OF DATA COLLECTION-------------------------------------------
Year:  [2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023] 20
Team Names:  ['manchester city', 'arsenal fc', 'chelsea fc', 'liverpool fc', 'tottenham hotspur', 'manchester united', 'aston villa', 'newcastle united', 'brighton & hove albion', 'west ham united', 'brentford fc', 'crystal palace', 'nottingham forest', 'afc bournemouth', 'everton fc', 'wolverhampton wanderers', 'fulham fc', 'burnley fc', 'sheffield united', 'luton town'] 20
Position:  [2, 1, 9, 3, 5, 6, 4, 7, 12, 8, 16, 14, 17, 10, 15, 11, 13, 19, 20, 18] 20
Goal Difference:  [57, 50, 41, 21, 15, 1, 19, -9, 4, -8, -7, -5, -4, -12, -11, -8, -20, -29, -32, -63] 20
Points:  [80, 79, 75, 67, 60, 54, 53, 49, 48, 48, 46, 44, 43, 40, 36, 35, 26, 25, 24, 16] 20
Squad Size:  [