In [1]:
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import concurrent.futures

In [2]:
# df = pd.read_csv('data/players_15.csv')
df = pd.read_csv('../data/FIFA22_official_data.csv')

In [3]:
df['player_url'] = 'https://sofifa.com/player/' + df['ID'].astype(str) + '/live'
player_urls = list(df['player_url'])

In [4]:
def get_teams_player_played_for(player_url, idx):
    page_html = requests.get(player_url).content
    page_html_soup = BeautifulSoup(page_html, 'html.parser')
    page_html_soup = page_html_soup.find_all(class_='card double-spacing')
    page_html_soup_new = page_html_soup[0].find_all(class_='col-name text-ellipsis')
    player_teams = set()
    for item in page_html_soup_new:
        if item.find_all('a') != []:    
            team = item.find_all('a')[0].text
            player_teams.add(team)
    return df['Name'][idx],  list(player_teams)

In [5]:
# test get_teams_player_played_for function
idx = 11
result= get_teams_player_played_for(df['player_url'][idx], idx)
result

('Marcos Llorente',
 [' Atlético de Madrid',
  ' Deportivo Alavés',
  ' Real Madrid CF',
  ' Real Madrid Castilla'])

In [6]:
#parallel computation using threads
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = []
    for idx, url in tqdm(enumerate(player_urls[:])):
        results.append(executor.submit(get_teams_player_played_for, url, idx))
    final_results = []
    for f in concurrent.futures.as_completed(results):
        final_results.append(f.result())

16710it [00:00, 43830.20it/s]


In [7]:
df_small = pd.DataFrame(columns=['player', 'teams'])

In [8]:
player_names = []
player_teams = []
for data in final_results:
    player_name, teams = data
    player_names.append(player_name)
    player_teams.append(teams)

df_small['player'] = player_names
df_small['teams']  = player_teams
df_small.to_pickle('../data/teams_player_has_played_for.pkl')
df_small.to_csv('../data/dteams_player_has_played_for.csv')