In [128]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import os

def scrape_player_profiles(url, output_folder, player_stats ):

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', class_='wf-table')

    # Initialize lists to store the data
    agent_data = []
    headers = []
    player_data = []

    for th in table.find_all('th'):
        title = th.get('title')
        if title:  
            headers.append(title)

    for row in table.find_all('tr')[1:]:  
        cols = row.find_all('td')
        if cols: 
            row_data = {}
            for i, col in enumerate(cols):
                if headers[i] == "Agent":  
                    img = col.find('img')
                    if img: 
                        row_data[headers[i]] = img.get('alt', 'Unknown')
                    else:
                        row_data[headers[i]] = 'Unknown'  
                else:
                    text = col.get_text(strip=True)
                    row_data[headers[i]] = text

            agent_data.append(row_data)

    current_team_section = None
    for header in soup.find_all('h2', class_='wf-label mod-large'):
        header_text = header.get_text(strip=True)

        if header_text == "Current Teams":
            current_team_section = header.find_next('div', class_='wf-card')
            break  

    team_name = "Unknown"

    if current_team_section:
        team_name_div = current_team_section.find('div', {'style': 'font-weight: 500;'})

        if team_name_div:
            team_name = team_name_div.get_text(strip=True)
            #print(f"{team_name}")




    player_id = url.split('/')[4] 
    player_name = url.split('/')[5]  
    player_name = player_name.split('?')[0] 
    player_name = player_name.replace('/', '') 
    #print(player_name)
    
    df = pd.DataFrame(agent_data)
    df['Rounds by Agents'] = df['Usage'].str.extract(r'\((\d+)\)')[0].astype(int)  
    df['Usage'] = df['Usage'].str.replace(r'\(\d+\)\s*', '', regex=True)  
    df['Usage'] = df['Usage'].str.replace('%', '', regex=True).astype(float)  

    df = df[df['Usage'] >= 5]  
    Total_Games = int(df['Rounds by Agents'].astype(int).sum())
    first_5_agents = df['Agent'].head(5).tolist() if not df.empty else []
    df_to_save = df.to_dict(orient='records')    
    
    stats = {
        "Rounds Played": player_stats.get('Rounds Played', None),
        "Rating": player_stats.get('Rating', None),
        "Average Combat Score": player_stats.get('Average Combat Score', None),
        "Kills:Death": player_stats.get('Kills:Death', None),
        "Kill, Assist, Survive, Trade Percent": player_stats.get('Kill, Assist, Survive, Trade Percent', None),
        "Average Damage per Round": player_stats.get('Average Damage per Round', None),
        "Kills Per Round": player_stats.get('Kills Per Round', None),
        "Assists Per Round": player_stats.get('Assists Per Round', None),
        "First Kills Per Round": player_stats.get('First Kills Per Round', None),
        "First Deaths Per Round": player_stats.get('First Deaths Per Round', None),
        "Headshot Percent": player_stats.get('Headshot Percent', None),
        "Clutch Success Percent%": player_stats.get('Clutch Success Percent', None),
        "Clutches (won/played)": player_stats.get('CL', None),
        "Maximum Kills in a single map": player_stats.get('Maximum Kills in a single map', None),
        "Kills": player_stats.get('Kills', None),
        "Deaths": player_stats.get('Deaths', None),
        "Assists": player_stats.get('Assists', None),
        "First Kills": player_stats.get('First Kills', None),
        "First Deaths": player_stats.get('First Deaths', None),
    }

    
    json_output = {
        "Player URL": url,
        "All Agents Stats": df.to_dict(orient='records'),
        "Current Team": team_name,
        "Player": player_name,
        "Signature Agents Ranked by Usage": first_5_agents,
        "Total games played": Total_Games,
        "Player stats": stats
    }
    
    os.makedirs(output_folder, exist_ok=True)

    json_filename = os.path.join(output_folder, f"{player_name.replace(' ', '_')}_{player_id}_profile.json")
    
    
    
    try:
        with open(json_filename, 'w') as json_file:
            json.dump(json_output, json_file, indent=4)
        #print(f"Data has been successfully saved to {json_filename}")
    except Exception as e:
        print(f"Error saving JSON: {e}")


    

    


def scrape_profiles_from_cleaned_data(df, output_folder):
    for index, player_row in df.iterrows():
        player_url = player_row.iloc[-1] 
        if pd.notna(player_url): 
            player_stats = {
                "Rounds Played": player_row['Rounds Played'],
                "Rating": player_row['Rating'],
                "Average Combat Score": player_row['Average Combat Score'],
                "Kills:Death": player_row['Kills:Death'],
                "Kill, Assist, Survive, Trade Percent": player_row['Kill, Assist, Survive, Trade Percent'],
                "Average Damage per Round": player_row['Average Damage per Round'],
                "Kills Per Round": player_row['Kills Per Round'],
                "Assists Per Round": player_row['Assists Per Round'],
                "First Kills Per Round": player_row['First Kills Per Round'],
                "First Deaths Per Round": player_row['First Deaths Per Round'],
                "Headshot Percent": player_row['Headshot Percent'],
                "Clutch Success Percent": player_row['Clutch Success Percent'],
                "Clutches (won/played)": player_row['Clutches (won/played)'],
                "Kills Max": player_row['Maximum Kills in a single map'],
                "Kills": player_row['Kills'],
                "Deaths": player_row['Deaths'],
                "Assists": player_row['Assists'],
                "First Kills": player_row['First Kills'],
                "First Deaths": player_row['First Deaths'],
            }
            scrape_player_profiles(player_url, output_folder, player_stats) 

output_folder = 'player_profiles'  # Base output folder

for event_name, df in all_dataframes.items():          
    event_output_folder = os.path.join(output_folder, event_name.replace(' ', '_'))  
    os.makedirs(event_output_folder, exist_ok=True) 
    scrape_profiles_from_cleaned_data(df, event_output_folder)

In [124]:
print(type(all_dataframes))  # Check the type
print(len(all_dataframes))  # Check the length
for item in all_dataframes:
    print(item) 

<class 'dict'>
6
Valorant Game Changers 2023
Valorant Game Changers 2024
Challengers League 2023
Challengers League 2024
Valorant Champions Tour 2023
Valorant Champions Tour 2024


In [126]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_event_data(event_name, event_id):
    url = f'https://www.vlr.gg/stats/?event_group_id={event_id}&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all'
    print(f'Scraping data from: {url}')
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    table = soup.find('table', class_='wf-table mod-stats mod-scroll')
    if table is None:
        print(f"No table found for {event_name}. Check the page structure or event ID.")
        return None  
    
    headers = [th.text.strip() for th in table.find_all('th')]
    headers.append('Player URL') 
    
    rows = []
    for row in table.find_all('tr')[1:]: 
        cells = row.find_all('td')
        row_data = [cell.text.strip() for cell in cells]

        player_link = row.find('a', href=True) 
        if player_link:
            player_url = 'https://www.vlr.gg' + player_link['href'] + '?timespan=all'
            player_name = player_link.text.strip()  
            player_name = player_name.replace('\n', '').strip()
        else:
            player_url = 'N/A'
            player_name = 'N/A'
        
        row_data.append(player_url) 
        
      
        rows.append(row_data)

    for r in rows:
        if len(r) != len(headers):
            print(f'Warning: Row length {len(r)} does not match header length {len(headers)}.')

    df = pd.DataFrame(rows, columns=headers)
    
    if 'Agent' in df.columns:
        df.drop(columns=['Agent'], inplace=True)
    
    if 'Player' in df.columns:  
        df.drop(columns=['Player'], inplace=True)



    column_mapping = {
        'Player': 'Player',
        'Agents': 'Agents',
        'Rnd': 'Rounds Played',
        'R2.0': 'Rating',
        'ACS': 'Average Combat Score',
        'K:D': 'Kills:Death',
        'KAST': 'Kill, Assist, Survive, Trade Percent',
        'ADR': 'Average Damage per Round',
        'KPR': 'Kills Per Round',
        'APR': 'Assists Per Round',
        'FKPR': 'First Kills Per Round',
        'FDPR': 'First Deaths Per Round',
        'HS%': 'Headshot Percent',
        'CL%': 'Clutch Success Percent',
        'CL': 'Clutches (won/played)',
        'KMax': 'Maximum Kills in a single map',
        'K': 'Kills',
        'D': 'Deaths',
        'A': 'Assists',
        'FK': 'First Kills',
        'FD': 'First Deaths',
        'Player URL': 'Player URL'
    }

    df.rename(columns=column_mapping, inplace=True)

    return df 

events = {
    'Valorant Game Changers 2023': '38',
    'Valorant Game Changers 2024': '62',
    'Challengers League 2023': '31',
    'Challengers League 2024': '59',
    'Valorant Champions Tour 2023': '45',
    'Valorant Champions Tour 2024': '61'
}

all_dataframes = {} 

for event_name, event_id in events.items():
    event_df = scrape_event_data(event_name, event_id) 
    if event_df is not None: 
        all_dataframes[event_name] = event_df



Scraping data from: https://www.vlr.gg/stats/?event_group_id=38&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all
Scraping data from: https://www.vlr.gg/stats/?event_group_id=62&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all
Scraping data from: https://www.vlr.gg/stats/?event_group_id=31&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all
Scraping data from: https://www.vlr.gg/stats/?event_group_id=59&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all
Scraping data from: https://www.vlr.gg/stats/?event_group_id=45&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all
Scraping data from: https://www.vlr.gg/stats/?event_group_id=61&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all


In [127]:
print(all_dataframes)

{'Valorant Game Changers 2023':     Agents Rounds Played Rating Average Combat Score Kills:Death  \
0     (+1)           986   1.48                304.4        1.74   
1                    590   1.38                306.6        1.55   
2     (+1)           233   1.37                263.3        1.41   
3                    884   1.37                291.8        1.53   
4     (+1)           282   1.36                258.2        1.42   
..     ...           ...    ...                  ...         ...   
513   (+6)           200   0.67                155.5        0.60   
514   (+2)           231   0.67                158.6        0.67   
515   (+1)           213   0.65                159.1        0.62   
516   (+3)           231   0.65                154.7        0.60   
517   (+5)           258   0.61                135.0        0.53   

    Kill, Assist, Survive, Trade Percent Average Damage per Round  \
0                                    78%                    197.1   
1            

In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json 

# URL of the page containing the table
url = 'https://www.vlr.gg/player/25741/lied/?timespan=all'

# Send a GET request to the URL
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table
table = soup.find('table', class_='wf-table')

# Initialize lists to store the data
data = []
headers = []

# Get headers
for th in table.find_all('th'):
    title = th.get('title')
    if title:  # Only include headers that have a title
        headers.append(title)


# Get rows
for row in table.find_all('tr')[1:]:  # Skip the header row
    cols = row.find_all('td')
    if cols:  # Ensure there are columns in the row
        row_data = {}
        for i, col in enumerate(cols):
            # Clean the text and get only the inner text without tags
            text = col.get_text(strip=True)
            row_data[headers[i]] = text
        data.append(row_data)

# Convert to DataFrame
df = pd.DataFrame(data)

# Print the DataFrame for debugging purposes
print(df)


try:
    df.to_json('table_data.json', orient='records', lines=True)
    print("Data has been successfully converted and saved to table_data.json")
except Exception as e:
    print(f"Error saving JSON: {e}")
    
with open('table_data.json', 'r') as ndjson_file:
    # Read each line and convert to a list of dictionaries
    data = [json.loads(line) for line in ndjson_file]

# Write the data to a new JSON file as an array
with open('table_data_converted.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("Converted NDJSON to JSON array format.")


   Agent     Usage Rounds Played Rating Average Combat Score Kills:Death  \
0         (35) 24%           715   1.31                291.9        1.37   
1         (20) 14%           384   1.26                237.0        1.35   
2          (13) 9%           257   1.27                225.8        1.32   
3          (12) 8%           252   1.06                191.0        1.04   
4          (11) 8%           205   1.40                304.5        1.62   
5          (10) 7%           216   1.14                211.2        1.07   
6           (9) 6%           181   1.29                268.4        1.37   
7           (7) 5%           152   0.98                189.1        0.84   
8           (7) 5%           130   1.01                169.3        1.01   
9           (5) 3%           108   1.16                210.8        1.14   
10          (4) 3%            84   1.29                254.5        1.44   
11          (3) 2%            70   1.00                176.3        0.94   
12          

In [7]:
def scrape_event_data(event_name, event_id):
    url = f'https://www.vlr.gg/stats/?event_group_id={event_id}&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all'
    print(url)
    
    # Send an HTTP request to the webpage
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table with player stats
    table = soup.find('table')
    if table is None:
        print(f"No table found for {event_name}. Check the page structure or event ID.")
        return None  # Return None if no table is found
    
    # Extract the table headers
    headers = [th.text for th in table.find_all('th')]
    headers.append('Player URL')  # Add a column for player URLs
    
    # Extract table rows
    rows = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        row_data = [cell.text.strip() for cell in cells]
        
        # Find the 'mod-agents' column that contains multiple images
        agent_td = row.find('td', class_='mod-agents')
        if agent_td:
            agent_imgs = agent_td.find_all('img')
            img_urls = [img['src'].replace('/img/vlr/game/agents/', '').replace('.png', '') for img in agent_imgs] if agent_imgs else ['N/A']
        else:
            img_urls = ['N/A']
        
        # Append the list of image URLs to the row (if you want to keep this part)
        row_data.append(', '.join(img_urls))  # Append the list of image URLs to the row
        
        # Find the player's profile link
        player_link = row.find('a')
        if player_link and 'href' in player_link.attrs:
            player_url = 'https://www.vlr.gg' + player_link['href'] + '?timespan=all'
        else:
            player_url = 'N/A'
        
        row_data.append(player_url)  # Append the player URL to the row
        rows.append(row_data)
    
    # Convert the data into a DataFrame
    df = pd.DataFrame(rows, columns=headers)
    
    return df  # Return the DataFrame
events = {
    'Valorant Game Changers 2023': '38',
    'Valorant Game Changers 2024': '62',
    'Challengers League 2023': '31',
    'Challengers League 2024': '59',
    'Valorant Champions Tour 2023': '45',
    'Valorant Champions Tour 2024': '61'
}


    
# Test DataFrame with a few player URLs
test_data = {
    'Player URL': [
        'https://www.vlr.gg/player/20064/vania/?timespan=all',  # Replace with actual player URLs
        'https://www.vlr.gg/player/25741/lied/?timespan=all'
    ]
}
test_df = pd.DataFrame(test_data)



In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to scrape data for a specific event
def scrape_event_data(event_name, event_id):
    url = f'https://www.vlr.gg/stats/?event_group_id={event_id}&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all'
    print(url)
    
    # Send an HTTP request to the webpage
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table with player stats
    table = soup.find('table')
    if table is None:
        print(f"No table found for {event_name}. Check the page structure or event ID.")
        return None  # Return None if no table is found
    
    # Extract the table headers
    headers = [th.text for th in table.find_all('th')]
    headers.append('Player URL')  # Add a column for player URLs
    
    # Extract table rows
    rows = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        row_data = [cell.text.strip() for cell in cells]
        
        # Find the 'mod-agents' column that contains multiple images
        agent_td = row.find('td', class_='mod-agents')
        if agent_td:
            agent_imgs = agent_td.find_all('img')
            img_urls = [img['src'].replace('/img/vlr/game/agents/', '').replace('.png', '') for img in agent_imgs] if agent_imgs else ['N/A']
        else:
            img_urls = ['N/A']
        
        row_data.append(', '.join(img_urls))  # Append the list of image URLs to the row
        
        # Find the player's profile link
        player_link = row.find('a')
        if player_link and 'href' in player_link.attrs:
            player_url = 'https://www.vlr.gg' + player_link['href']+ '?timespan=all'
        else:
            player_url = 'N/A'
        
        
        row_data.append(player_url)  # Append the player URL to the row
        rows.append(row_data)
    
    # Convert the data into a DataFrame
    df = pd.DataFrame(rows, columns=headers)
    #print(df)
    
    return df  # Return the DataFrame

# List of events with correct event IDs
events = {
    'Valorant Game Changers 2023': '38',
    'Valorant Game Changers 2024': '62',
    'Challengers League 2023': '31',
    'Challengers League 2024': '59',
    'Valorant Champions Tour 2023': '45',
    'Valorant Champions Tour 2024': '61'
}

# List to hold DataFrames
all_dataframes = []

# Loop through each event and scrape data
for event_name, event_id in events.items():
    event_df = scrape_event_data(event_name, event_id)
    if event_df is not None:
        all_dataframes.append(event_df)  # Add the DataFrame to the list

# Concatenate all DataFrames and save as a single CSV
if all_dataframes:
    final_df = pd.concat(all_dataframes, ignore_index=True)
    final_df.to_csv('all_events_data.csv', index=False)
    print("All event data saved as all_events_data.csv")



# Replace 'Player Name' with the actual column name for player names in your dataset

final_df = final_df.drop_duplicates(subset=['Player'], keep='first')

# Save the cleaned DataFrame to a new CSV file
final_df.to_csv('cleaned_player_data.csv', index=False)

print("Duplicates removed and saved to cleaned_player_data.csv")

# Assuming the final DataFrame from the previous scraping step is named final_df
# You should replace this with the DataFrame you created earlier
# Scrape player profiles based on the DataFrame
         


https://www.vlr.gg/stats/?event_group_id=38&event_id=all&region=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all


ValueError: 22 columns passed, passed data had 23 columns

                                          Player URL  \
0  https://www.vlr.gg/player/17976/florescent?tim...   
1  https://www.vlr.gg/player/10653/suzu?timespan=all   
2  https://www.vlr.gg/player/30123/bakkushaw?time...   
3  https://www.vlr.gg/player/21265/keenc?timespan...   
4  https://www.vlr.gg/player/18215/rachell?timesp...   

                                          All Agents  \
0  [{'Agent Name': '', 'Image URL': 'https://www....   
1  [{'Agent Name': '', 'Image URL': 'https://www....   
2  [{'Agent Name': '', 'Image URL': 'https://www....   
3  [{'Agent Name': '', 'Image URL': 'https://www....   
4  [{'Agent Name': '', 'Image URL': 'https://www....   

                          Current Team  
0                    Shopify Rebellion  
1                     ZETA DIVISION GC  
2  Anorthosis Famagusta Esports Female  
3                     Evil Geniuses GC  
4                          Exalty Gaïa  
                                          Player URL  \
0  https://www.vlr.gg/pl

In [3]:
import requests
import pandas as pd

# Base API URL to fetch player list
api_url = 'https://vlr.orlandomm.net/api/v1/players'

# Parameters for the request to fetch all players
params = {
    'limit': 'all',  # Fetch all players
    'minrounds': 200,
    'minrating': 1550,
    'timespan': '400'
}

# Send a GET request to the API to fetch all players
response = requests.get(api_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON data
    data = response.json()

    # Extract player data
    players = data['data']

    # Initialize a list to store detailed player data
    detailed_players = []

    # Loop through each player to get detailed stats using their playerid
    for player in players:
        playerid = player['id']  # Extract player ID from the general list

        # API URL to get player stats using playerid
        player_stats_url = f'https://vlr.orlandomm.net/api/v1/players/{playerid}'

        # Send a GET request to fetch player stats
        stats_response = requests.get(player_stats_url)

        if stats_response.status_code == 200:
            # Parse the player's detailed data
            player_stats_data = stats_response.json()

            # Extract detailed player info
            info = player_stats_data['data']['info']
            team = player_stats_data['data'].get('team', {})
            results = player_stats_data['data'].get('results', [])
            past_teams = player_stats_data['data'].get('pastTeams', [])
            socials = player_stats_data['data'].get('socials', {})

            # Create a dictionary with the player's data
            player_info = {
                'Player ID': info.get('id', 'N/A'),
                'Username': info.get('user', 'N/A'),
                'Name': info.get('name', 'N/A'),
                'Country': info.get('country', 'N/A'),
                'Flag': info.get('flag', 'N/A'),
                'Player Image URL': info.get('img', 'N/A'),
                'Player Profile URL': info.get('url', 'N/A'),
                'Team ID': team.get('id', 'N/A'),
                'Team Name': team.get('name', 'N/A'),
                'Team Logo URL': team.get('logo', 'N/A'),
                'Team Joined': team.get('joined', 'N/A'),
                'Matches Played': len(results),
                'Past Teams': ', '.join([team['name'] for team in past_teams]) if past_teams else 'N/A',
                'Twitter': socials.get('twitter', 'N/A'),
                'Twitter URL': socials.get('twitter_url', 'N/A'),
                'Twitch': socials.get('twitch', 'N/A'),
                'Twitch URL': socials.get('twitch_url', 'N/A')
            }

            # Add the merged info to the list
            detailed_players.append(player_info)

        else:
            print(f"Failed to fetch stats for player {playerid}. Status code: {stats_response.status_code}")

    # Convert the detailed player data into a DataFrame
    df = pd.DataFrame(detailed_players)

    # Save to CSV
    df.to_csv('vlr_detailed_player_data.csv', index=False)
    print("CSV with detailed player data saved successfully!")

else:
    print(f"Failed to fetch player data. Status code: {response.status_code}")


Failed to fetch player data. Status code: 523
