In [None]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import pickle

<h1>Scraping www.soccernews.com for transfers</h1>

In [None]:
def url_to_names_and_flags_prices(url, timeout=30, n_trailing=10):
  """Scrape player names, flag labels and prices from one transfers page.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout `requests` waits indefinitely on a stalled connection.
    n_trailing: Number of entries removed from the END of each of the three
      lists before returning (the original code hard-coded 10).

  Returns:
    A list `[names, flags, prices]` of three lists:
      * names  - stripped text of the first <h4> inside every element with
                 class 'player-deals' (elements without an <h4> are skipped),
      * flags  - the `alt` text of the <img> inside every element with class
                 'coutry-flag' (None when the image or its `alt` is missing),
      * prices - the raw text of every element with class 'price-status'.
    The three lists are collected independently, so equal lengths are not
    guaranteed by this function.

  Raises:
    requests.exceptions.RequestException: on connection errors or timeout.
  """
  # NOTE(review): the HTTP status is not checked, so an error page is parsed
  # like any other page and normally just yields empty lists.
  response = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(response.text, 'html.parser')

  names = []
  for cell in soup.find_all(class_='player-deals'):
    heading = cell.find('h4')
    if heading:
      names.append(heading.get_text(strip=True))

  # 'coutry-flag' (sic): the selector is kept exactly as in the original query.
  flags = []
  for cell in soup.find_all(class_='coutry-flag'):
    image = cell.img
    # Keep a placeholder instead of crashing on a cell with no <img>/alt,
    # so the remaining entries stay in their positions.
    flags.append(image.get('alt') if image is not None else None)

  prices = [cell.text for cell in soup.find_all(class_='price-status')]

  def _drop_trailing(items):
    # Same result as items[:-n_trailing] for positive n, but also correct
    # for n_trailing == 0 (where items[:-0] would return an empty list).
    return items[:max(len(items) - n_trailing, 0)]

  # NOTE(review): presumably the trailing entries belong to a widget that
  # reuses the same CSS classes - TODO confirm against the live page.
  return [_drop_trailing(names), _drop_trailing(flags), _drop_trailing(prices)]

In [None]:
def url_to_df(main_url, start_year=2009, end_year=2023):
  """Scrape a league's transfer pages for several seasons into a DataFrame.

  The league's main page is fetched first, followed by one page per season
  whose URL is the main URL with '-<year>-<year+1>' appended.

  Args:
    main_url: URL of the league's main transfers page, with or without a
      trailing slash.
    start_year: First season start year to fetch (inclusive).
    end_year: Season start year at which to stop (exclusive), i.e. the last
      season fetched is '<end_year-1>-<end_year>'.

  Returns:
    A DataFrame with columns 'Name', 'Country', 'Price' and 'Year'. 'Year'
    holds the season label ('2009-2010', ...); rows from the main page get
    the last nine characters of the URL path (e.g. 'transfers'), exactly as
    the original slicing produced, so existing callers can relabel them.
  """
  # Normalise once so a missing trailing slash cannot chop a character off
  # the slug or shift the season label.
  base = main_url.rstrip('/')

  # (season label, page url) pairs, main page first.
  pages = [(base[-9:], base + '/')]
  for first_year in range(start_year, end_year):
    season = '{}-{}'.format(first_year, first_year + 1)
    pages.append((season, '{}-{}/'.format(base, season)))

  names = []
  flags = []
  years = []
  prices = []

  for season, url in pages:
    page_names, page_flags, page_prices = url_to_names_and_flags_prices(url)

    names.extend(page_names)
    flags.extend(page_flags)
    prices.extend(page_prices)
    # One season label per scraped name keeps 'Year' aligned with 'Name'.
    years.extend([season] * len(page_names))

  return pd.DataFrame({'Name': names, 'Country': flags, 'Price': prices, 'Year': years})

In [None]:
# Main transfers page of each league; url_to_df derives the per-season pages.
links = ['https://www.soccernews.com/soccer-transfers/english-premier-league-transfers/',
         'https://www.soccernews.com/soccer-transfers/italian-serie-a-transfers/',
         'https://www.soccernews.com/soccer-transfers/german-bundesliga-transfers/',
         'https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers/'
         ]

# League labels, in the same order as `links`.
# NOTE(review): 'Seria A' looks like a misspelling of 'Serie A' (cf. the URL
# slug). It is left unchanged because the label is written to the output CSV
# and other code may match on it - confirm before correcting.
names = ['Premier League', 'Seria A', 'Bundesliga', 'La Liga']

dfs = []
for link, league in zip(links, names):
  df = url_to_df(link)
  df['League'] = league
  # Rows from the main page carry the placeholder label 'transfers';
  # relabel them as the 2023-2024 season.
  df['Year'] = df['Year'].replace('transfers', '2023-2024')
  dfs.append(df)

# Concatenate once instead of growing the frame inside a loop (same rows,
# same order, same index as the repeated pairwise concat).
df_all = pd.concat(dfs, axis=0)

In [None]:
# Preview the first rows of the combined transfers table.
df_all.head()

Unnamed: 0,Name,Country,Price,Year,League
0,Mason Holgate,England,Loan,2023-2024,Premier League
1,Maxime Esteve,France,Loan,2023-2024,Premier League
2,Adam Wharton,England,21.1million,2023-2024,Premier League
3,Rodrigo Ribeiro,Portugal,Loan,2023-2024,Premier League
4,Joe Gauci,Australia,1.5million,2023-2024,Premier League


In [None]:
# Maps country labels as they occur in the scraped data to three-letter codes.
# Besides the regular names, the table deliberately lists the variants found
# in the source: labels with a trailing space ('Spain ', 'Greece ', ...),
# alternative names ('Cabo Verde' / 'Cape Verde') and misspellings
# ('Ukraina', 'Gabor', 'Gergia'). Every variant maps to the same code as the
# regular spelling.
# NOTE(review): the codes mostly look like football-federation trigrams;
# confirm which coding scheme downstream consumers expect.
country_abbreviations = {
    'England': 'ENG',
    'France': 'FRA',
    'Portugal': 'POR',
    'Australia': 'AUS',
    'Albania': 'ALB',
    'Belgium': 'BEL',
    'Turkey': 'TUR',
    'United States': 'USA',
    'Japan': 'JPN',
    'Colombia': 'COL',
    'Croatia': 'CRO',
    'Sweden': 'SWE',
    'Argentina': 'ARG',
    'Serbia': 'SRB',
    'Spain': 'ESP',
    'Ivory Coast': 'CIV',
    'Romania': 'ROU',
    'Germany': 'GER',
    'Chile': 'CHI',
    'Netherlands': 'NED',
    'Greece': 'GRE',
    'Wales': 'WAL',
    'Nigeria': 'NGA',
    'Senegal': 'SEN',
    'Ireland': 'IRL',
    'Morocco': 'MAR',
    'Uruguay': 'URU',
    'Brazil': 'BRA',
    'Paraguay': 'PAR',
    'Cameroon': 'CMR',
    'Ghana': 'GHA',
    'Italy': 'ITA',
    'Ecuador': 'ECU',
    'Scotland': 'SCO',
    'Mexico': 'MEX',
    'Norway': 'NOR',
    'Denmark': 'DEN',
    'Burkina Faso': 'BFA',
    'Hungary': 'HUN',
    'Zimbabwe': 'ZIM',
    'Switzerland': 'SUI',
    'Northern Ireland': 'NIR',
    'Tunisia': 'TUN',
    'Israel': 'ISR',
    'Venezuela': 'VEN',
    'Togo': 'TOG',
    'Egypt': 'EGY',
    'Jamaica': 'JAM',
    'Algeria': 'ALG',
    'DR Congo': 'COD',
    'South Korea': 'KOR',
    'Rep. of Ireland': 'IRL',
    'Bosnia': 'BIH',
    'Curacao': 'CUW',
    'Czech Rep.': 'CZE',
    'Austria': 'AUT',
    'Poland': 'POL',
    'Lithuania': 'LTU',
    'New Zealand': 'NZL',
    'Mali': 'MLI',
    'Costa Rica': 'CRC',
    'Benin': 'BEN',
    'Spain ': 'ESP',
    'Gabon': 'GAB',
    'Estonia': 'EST',
    'Iceland': 'ISL',
    'Armenia': 'ARM',
    'Ukraine': 'UKR',
    'Kenya': 'KEN',
    'Slovakia': 'SVK',
    'Bermuda': 'BER',
    'Montenegro': 'MNE',
    'China': 'CHN',
    'Bosnia and Herzegovina': 'BIH',
    'Congo': 'CGO',
    'Slovenia': 'SVN',
    'Czech Republic': 'CZE',
    'Iran': 'IRN',
    'South Africa': 'RSA',
    'Republic of Tanzania': 'TAN',
    'Azerbaijan': 'AZE',
    'Greece ': 'GRE',
    'Zambia': 'ZAM',
    'Portugal ': 'POR',
    'Korea': 'KOR',
    'Georgia': 'GEO',
    'Cabo Verde': 'CPV',
    'Angola': 'ANG',
    'Uzbekistan': 'UZB',
    'Sierra Leone': 'SLE',
    'Guinea': 'GUI',
    'Bulgaria': 'BUL',
    'Finland': 'FIN',
    'FYR Macedonia': 'MKD',
    'Libya': 'LBY',
    'Iraq': 'IRQ',
    'Italy ': 'ITA',
    'Cyprus': 'CYP',
    'Moldova': 'MDA',
    'Liechtenstein': 'LIE',
    'Guadeloupe': 'GLP',
    'Kosovo': 'KVX',
    'Gambia': 'GAM',
    'Russia': 'RUS',
    'Martinique': 'MTQ',
    'Cape Verde': 'CPV',
    # Not a country: gives rows labelled 'loan' a code, so they are not
    # removed when rows with missing values are dropped later.
    'loan': 'LOA',
    'France ': 'FRA',
    'Latvia': 'LAT',
    # Misspelling of 'Georgia' in the source data; use the same code as the
    # correctly spelled entry (was the made-up 'GGA').
    'Gergia': 'GEO',
    'Philippines': 'PHL',
    'North Macedonia': 'MKD',
    'Peru': 'PER',
    'USA': 'USA',
    'Switzerland ': 'SUI',
    'French Guiana': 'GUF',
    'Ukraine ': 'UKR',
    'Canada': 'CAN',
    'Poland ': 'POL',
    'Panama': 'PAN',
    'Honduras': 'HON',
    'Mozambique': 'MOZ',
    'Syria': 'SYR',
    'Dominican Republic': 'DOM',
    'Suriname': 'SUR',
    'Bosnia ': 'BIH',
    'Ukraina': 'UKR',
    'Qatar': 'QAT',
    'Saudi Arabia': 'SAU',
    'Guinea-Bissau': 'GNB',
    'Gabor': 'GAB'
}

In [None]:
# Look up the three-letter code for every row. Surrounding whitespace is
# stripped from the label first: the scraped labels sometimes carry a
# trailing space (see the 'Spain ' / 'Greece ' style keys in the mapping),
# and any such variant not listed explicitly would otherwise map to NaN.
# The 'Country' column itself is left untouched.
df_all['Country_Code'] = df_all['Country'].str.strip().map(country_abbreviations)

In [None]:
# Discard every row that has a missing value in any column; this includes
# rows whose country label had no entry in the code mapping.
df_all = df_all.dropna(axis=0, how='any')

In [None]:
# Preview the table again, now with the Country_Code column.
df_all.head()

Unnamed: 0,Name,Country,Price,Year,League,Country_Code
0,Mason Holgate,England,Loan,2023-2024,Premier League,ENG
1,Maxime Esteve,France,Loan,2023-2024,Premier League,FRA
2,Adam Wharton,England,21.1million,2023-2024,Premier League,ENG
3,Rodrigo Ribeiro,Portugal,Loan,2023-2024,Premier League,POR
4,Joe Gauci,Australia,1.5million,2023-2024,Premier League,AUS


In [None]:
# (rows, columns) of the table after dropping rows with missing values.
df_all.shape

(8158, 6)

In [None]:
# Persist the final table without the index column. 'utf-8-sig' prefixes
# the file with a UTF-8 byte-order mark.
df_all.to_csv(
    'df_transfers.csv',
    encoding='utf-8-sig',
    index=False,
)
