In [13]:
import requests
from bs4 import BeautifulSoup

class PlayerProfileScraper:
    def __init__(self, full_name):
        self.full_name = full_name
        self.full_name_for_url = full_name.replace(' ', '+')
        self.base_url = f"https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query={self.full_name_for_url}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

    def fetch_search_results(self):
        """Perform HTTP request to get search results."""
        response = requests.get(self.base_url, headers=self.headers)
        if response.status_code != 200:
            print(f"HTTP request error: {response.status_code}")
            return None
        return response.text

    def parse_first_table(self, html_content):
        """Parse HTML content to retrieve only the player's name, position, and age."""
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table', class_='items')
        
        if table is None:
            print("No table found.")
            return None

        # Extract the first row of the table
        first_row = table.find('tbody').find('tr')
        if first_row is None:
            print("No rows found in the table.")
            return None

        # Extract Name
        name_cell = first_row.find('td')
        print(name_cell)
        player_name = name_cell.find('a', title=True).text if name_cell.find('a', title=True) else "Unknown"

        # Extract Position (5th column)
        position = first_row.find_all('td')[4].text.strip()

        # Extract Age (7th column)
        age = first_row.find_all('td')[6].text.strip()

        # Print extracted data
        print("\nPlayer Information:")
        print(f"Name: {player_name}")
        print(f"Position: {position}")
        print(f"Age: {age}")
        
        return {
            "name": player_name,
            "position": position,
            "age": age
        }

    def save_player_profile(self):
        """Main method to fetch player profile and print player info."""
        html_content = self.fetch_search_results()
        if html_content:
            player_info = self.parse_first_table(html_content)
            if player_info:
                print(player_info)

if __name__ == "__main__":
    # Replace 'Player Name' with the name of the player you want to search for
    player_name = "Edinson Cavani"  
    scraper = PlayerProfileScraper(player_name)
    scraper.save_player_profile()


<td><table class="inline-table"><tr><td rowspan="2"><a href="#"><img alt="Edinson Cavani" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/48280-1619791055.jpg?lm=1" title="Edinson Cavani"/></a></td><td class="hauptlink"><a href="/edinson-cavani/profil/spieler/48280" title="Edinson Cavani">Edinson Cavani</a></td></tr><tr><td><a href="/club-atletico-boca-juniors/startseite/verein/189" title="CA Boca Juniors">CA Boca Juniors</a></td></tr></table></td>

Player Information:
Name: Edinson Cavani
Position: CF
Age: 37
{'name': 'Edinson Cavani', 'position': 'CF', 'age': '37'}


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import asyncio

# Load the provided CSV file
file_path = '../../data/cleaned_scouting_report copy.csv'
scouting_report_df = pd.read_csv(file_path)

# Define the scraper class
class PlayerProfileScraper:
    def __init__(self, full_name):
        self.full_name = full_name
        self.full_name_for_url = full_name.replace(' ', '+')
        self.base_url = f"https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query={self.full_name_for_url}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

    def fetch_search_results(self):
        """Perform HTTP request to get search results."""
        try:
            response = requests.get(self.base_url, headers=self.headers)
            if response.status_code != 200:
                return None
            return response.text
        except requests.RequestException:
            return None

    def parse_first_table(self, html_content):
        """Parse HTML content to retrieve only the player's name, position, and age."""
        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table', class_='items')
        if not tables:
            return {"name": self.full_name, "position": "XX", "age": "XX"}
        
        table = tables[0]
        first_row = table.find('tbody').find('tr')
        if first_row is None:
            return {"name": self.full_name, "position": "XX", "age": "XX"}

        try:
            name_cell = first_row.find('td')
            player_name = name_cell.find('a', title=True).text if name_cell.find('a', title=True) else "Unknown"
            position = first_row.find_all('td')[4].text.strip() if len(first_row.find_all('td')) > 4 else "XX"
            age = first_row.find_all('td')[6].text.strip() if len(first_row.find_all('td')) > 6 else "XX"
        except IndexError:
            position, age = "XX", "XX"

        return {
            "name": player_name,
            "position": position,
            "age": age
        }

# Ensure 'Position' and 'Age' columns exist
if 'Age' not in scouting_report_df.columns:
    scouting_report_df['Age'] = None

# Define async function to enable await in retry
async def scrape_player_data(player_name, max_retries=3):
    scraper = PlayerProfileScraper(player_name)
    retries = 0
    player_info = {"position": "XX", "age": "XX"}
    
    while (player_info["position"] == "XX" or player_info["age"] == "XX") and retries < max_retries:
        html_content = scraper.fetch_search_results()
        if html_content:
            player_info = scraper.parse_first_table(html_content)
        retries += 1
        if player_info["position"] == "XX" or player_info["age"] == "XX":
            await asyncio.sleep(1)  # Await async sleep for retry

    return player_info

# Loop through each player and update Position and Age
async def update_scouting_report():
    for idx, row in scouting_report_df.iterrows():
        player_name = row['player_name']
        player_info = await scrape_player_data(player_name)
        scouting_report_df.at[idx, 'Position'] = player_info['position']
        scouting_report_df.at[idx, 'Age'] = player_info['age']
        
        # Log the result
        print(player_name, '-', player_info['position'], '-', player_info['age'])
        await asyncio.sleep(1)  # Respectful delay for new request

# Run the update function
await update_scouting_report()

# Save the updated dataframe
scouting_report_df[['player_name', 'Position', 'Age']].to_csv('../../data/updated_scouting_report.csv', index=False)
print("Updated scouting report saved to 'updated_scouting_report.csv'")


Denis Zakaria - DM - 27
Eliesse Ben Seghir - LW - 19
Vanderson - RB - 23
Thilo Kehrer - CB - 28
Takumi Minamino - LW - 29
Lamine Camara - CM - 20
Folarin Balogun - CF - 23
Maghnes Akliouche - RW - 22
Mohammed Salisu - CB - 25
Wilfried Singo - CB - 23
Breel Embolo - CF - 27
Caio Henrique Oliveira Silva - LB - 27
Aleksandr Golovin - AM - 28
Ismail Jakobs - LB - 25
Soungoutou Magassa - DM - 21
Jordan Teze - RB - 25
Christian Mawissa - CB - 19
Kassoum Ouattara - LB - 20
George Ilenikhena - CF - 18
Krépin Diatta - RW - 25
Guillermo Maripán - CB - 30
Guillermo Maripán - CB - 30
Willian Pacho - CB - 23
Achraf Hakimi - RB - 25
João Neves - DM - 20
Warren Zaïre-Emery - CM - 18
Bradley Barcola - LW - 22
Marquinhos - CB - 30
Lucas Beraldo - CB - 20
Ousmane Dembélé - RW - 27
Lee Kang-in - XX - XX
Vitinha - CM - 24
Marco Asensio - RW - 28
Fabián Ruiz - CM - 28
Nuno Mendes - LB - 22
Randal Kolo Muani - CF - 25
Désiré Doué - LW - 19
Milan Škriniar - CB - 29
Gonçalo Ramos - CF - 23
Carlos Soler - CM -

In [9]:
import pandas as pd

# Load both CSV files
updated_report_path = '../../data/updated_scouting_report.csv'
cleaned_report_path = '../../data/cleaned_scouting_report copy.csv'

updated_scouting_df = pd.read_csv(updated_report_path)
cleaned_scouting_df = pd.read_csv(cleaned_report_path)

# Merge the dataframes on 'player_name' with 'Position' and 'Age' from the updated report
merged_df = cleaned_scouting_df.merge(
    updated_scouting_df[['player_name', 'Position', 'Age']],
    on='player_name',
    how='left',
    suffixes=('', '_updated')
)

# Update 'Position' column with values from 'Position_updated' and add 'Age'
merged_df['Position'] = merged_df['Position_updated'].combine_first(merged_df['Position'])
merged_df.drop(columns=['Position_updated'], inplace=True)

# Save the merged result
merged_report_path = '../../data/cleaned_scouting_report_with_age.csv'
merged_df.to_csv(merged_report_path, index=False)

print(f"Merged report saved to '{merged_report_path}'")


Merged report saved to '../../data/cleaned_scouting_report_with_age.csv'
