In [1]:
# import dependencies
from bs4 import BeautifulSoup as bs
import requests
import re
import timeit
import pandas as pd

# Find Links

In [2]:
# url to the matches tab
url = 'https://www.vlr.gg/event/matches/2095/champions-tour-2024-americas-stage-2/?series_id=4031'

In [3]:
# scrape all the urls from the matches tab
print('Beginning URL Retrieval')
print('------------------------')

# get the html file using request
html_txt = requests.get(url)
soup = bs(html_txt.text, 'lxml')

try:
    # map to the correct location in the html file
    body = soup.find('body')
    div_container = body.find('div', class_='col-container')
    div_card = div_container.find_all('div', class_='wf-card')
    url_list = []
    perf_list = []
    over_list = []

    for idx, x in enumerate(div_card):
        if idx != 0:
            # create a base_url
            base_link = []
            base_url = f'https://www.vlr.gg'
            for link in div_card[idx].find_all('a'):
                base_link.append(link.get('href'))

            # fill the df with base urls
            for url in base_link:
                url_list.append(f'{base_url}{url}')
                
            # fill the df with the urls + /?game=all&tab=performance 
            # brings you to the performance tab 
            for url in base_link:
                perf_list.append(f'{base_url}{url}{"/?game=all&tab=performance"}')
                
            # fill the df with the urls + /?game=all&tab=overview 
            # brings you to the overview tab
            for url in base_link:
                over_list.append(f'{base_url}{url}{"/?game=all&tab=overview"}')

except AttributeError:
        print('There was a missing URL')

print('------------------------')
print(f'Found {len(url_list)} games!\n')

Beginning URL Retrieval
------------------------
------------------------
Found 25 games!



# Performance

In [4]:
def scrape_data(url_list):
    all_dfs = []  # List to store DataFrames from all URLs
    
    for url in url_list:
        response = requests.get(url)
        if response.status_code == 200:
            soup = bs(response.content, 'html.parser')

            # Initialize a list to store DataFrames
            dfs = []

            # Find all game divs
            game_divs = soup.find_all('div', class_='vm-stats-game')

            for game_div in game_divs:
                # Find the table within each game div
                table = game_div.find('table', class_='wf-table-inset mod-adv-stats')

                if table:
                    # Extract table data into a DataFrame
                    table_data = []
                    rows = table.find_all('tr')
                    for row in rows:
                        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
                        table_data.append(row_data)

                    # Convert table_data into a DataFrame and append to dfs list
                    df = pd.DataFrame(table_data[1:], columns=table_data[0])  # Assuming first row is header
                    dfs.append(df)

            # Drop the first DataFrame (if available)
            if dfs:
                dfs.pop(0)  # Remove the first DataFrame
                all_dfs.extend(dfs)
            else:
                print('No DataFrames found for URL:', url)

        else:
            print('Failed to retrieve the webpage. Status code:', response.status_code)

    return all_dfs

def process_data_frames(data_frames):
    processed_dfs = {}  # Dictionary to store processed DataFrames

    for i, df in enumerate(data_frames):
        df_copy = df.copy()

        # Rename the columns
        df_copy.columns = ['name', 'blank', '2K', '3K', '4K', '5K', '1v1', '1v2', '1v3', '1v4', '1v5', 'ECON', 'PL', 'DE']

        # Clean the 'name' column
        df_copy['name'] = df_copy['name'].str.strip().str.replace('\t', '').str.replace('\n', '')

        # Clean the team names
        team_names = ['MIBR', 'LEV', 'SEN', 'NRG', 'FUR', '100T', 'LOUD', 'EG', 'G2', 'C9', 'KRÜ']
        for team in team_names:
            df_copy['name'] = df_copy['name'].str.replace(team, '').str.strip()

        # Drop all unneeded columns
        df_copy = df_copy.drop(columns=['blank', '2K', '3K', '4K', '5K', 'ECON', 'PL', 'DE'])

        # Extract the first character from each cell in 'col1'
        df_copy['1v1'] = df_copy['1v1'].str.extract(r'^(\d+)')
        df_copy['1v2'] = df_copy['1v2'].str.extract(r'^(\d+)')
        df_copy['1v3'] = df_copy['1v3'].str.extract(r'^(\d+)')
        df_copy['1v4'] = df_copy['1v4'].str.extract(r'^(\d+)')
        df_copy['1v5'] = df_copy['1v5'].str.extract(r'^(\d+)')

        df_copy = df_copy.fillna(0)

        # Convert all columns except the first one to integers
        for column in df_copy.columns[1:]:
            df_copy[column] = df_copy[column].astype(int)

        # Calculate the total clutches and create a new column
        df_copy['clutches'] = df_copy[['1v1', '1v2', '1v3', '1v4', '1v5']].sum(axis=1)

        # Drop all unneeded columns
        df_copy = df_copy.drop(columns=['1v1', '1v2', '1v3', '1v4', '1v5'])

        # Store the processed DataFrame with a unique name
        processed_dfs[f'df_{i+1}'] = df_copy
    
    return processed_dfs

In [5]:
# Scrape the perf_list
data_frames = scrape_data(perf_list)
processed_dfs = process_data_frames(data_frames)

No DataFrames found for URL: https://www.vlr.gg/353197/cloud9-vs-kr-esports-champions-tour-2024-americas-stage-2-w4/?game=all&tab=performance
No DataFrames found for URL: https://www.vlr.gg/353198/evil-geniuses-vs-mibr-champions-tour-2024-americas-stage-2-w4/?game=all&tab=performance
No DataFrames found for URL: https://www.vlr.gg/353199/leviat-n-vs-furia-champions-tour-2024-americas-stage-2-w4/?game=all&tab=performance
No DataFrames found for URL: https://www.vlr.gg/353200/loud-vs-100-thieves-champions-tour-2024-americas-stage-2-w4/?game=all&tab=performance
No DataFrames found for URL: https://www.vlr.gg/353201/sentinels-vs-g2-esports-champions-tour-2024-americas-stage-2-w4/?game=all&tab=performance


In [6]:
# check the first df
# Using list conversion and index
entry_key = list(processed_dfs.keys())[0]
df = processed_dfs[entry_key]
df

Unnamed: 0,name,clutches
0,ShahZaM,0
1,liazzi,0
2,artzin,0
3,mazin,1
4,Pa1nt,0
5,C0M,0
6,tex,1
7,Mazino,0
8,aspas,0
9,kiNgg,0


# Overview

In [7]:
def clean_dataframe(df, team_names):
    if df.empty:
        return None
    
    df_copy = df.copy()

    # Rename the columns
    df_copy.columns = ['name', 'blank', 'rating', 'acs', 'kills', 'deaths', 'assists', 'k/d', 'KAST', 'adr', 'hs', 'fk', 'fd', 'fk/fd']

    # Clean the 'name' column
    df_copy['name'] = df_copy['name'].str.strip().str.replace('\t', '').str.replace('\n', '')
    
    # Clean the team names
    for team in team_names:
        df_copy['name'] = df_copy['name'].str.replace(team, '').str.strip()

    # Drop all unneeded columns
    df_copy = df_copy.drop(columns=['blank', 'rating', 'acs', 'k/d', 'KAST', 'hs', 'fk/fd'])
    
    # Apply a lambda function to extract the first number from each cell
    df_copy['kills'] = df_copy['kills'].apply(lambda x: x.split('\n')[0] if x else None)

    # Use a try-except block to handle potential errors in 'deaths' column processing
    try:
        df_copy['deaths'] = df_copy['deaths'].apply(lambda x: int(re.findall(r'\d+', x)[0]) if x else None)
    except IndexError:
        df_copy['deaths'] = None  # Handle the error by assigning a default value

    df_copy['assists'] = df_copy['assists'].apply(lambda x: x.split('\n')[0] if x else None)
    df_copy['adr'] = df_copy['adr'].apply(lambda x: x.split('\n')[0] if x else None)
    df_copy['fk'] = df_copy['fk'].apply(lambda x: x.split('\n')[0] if x else None)
    df_copy['fd'] = df_copy['fd'].apply(lambda x: x.split('\n')[0] if x else None)

    return df_copy

def scrape_data(url_list):
    all_dfs = {}  # Dictionary to store processed DataFrames for each URL

    team_names = ['MIBR', 'LEV', 'SEN', 'NRG', 'FUR', '100T', 'LOUD', 'EG', 'G2', 'C9', 'KRÜ']

    for url in url_list:
        response = requests.get(url)
        if response.status_code == 200:
            soup = bs(response.content, 'html.parser')

            # Initialize lists to store DataFrames for each pass
            first_pass_dfs = []
            second_pass_dfs = []

            # Find all game divs
            game_divs = soup.find_all('div', class_='vm-stats-game')

            # First pass: Find initial tables
            for game_div in game_divs:
                table = game_div.find('table', class_='wf-table-inset mod-overview')

                if table:
                    # Extract table data into a DataFrame
                    table_data = []
                    rows = table.find_all('tr')
                    for row in rows:
                        row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
                        table_data.append(row_data)

                    # Convert table_data into a DataFrame and append to first_pass_dfs list
                    df = pd.DataFrame(table_data[1:], columns=table_data[0])  # Assuming first row is header
                    first_pass_dfs.append(df)

            # Second pass: Find the next tables
            for game_div in game_divs:
                table = game_div.find('table', class_='wf-table-inset mod-overview')
                if table:
                    next_table = table.find_next('table', class_='wf-table-inset mod-overview')
                    if next_table:
                        # Extract table data into a DataFrame
                        table_data = []
                        rows = next_table.find_all('tr')
                        for row in rows:
                            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
                            table_data.append(row_data)

                        # Convert table_data into a DataFrame and append to second_pass_dfs list
                        df = pd.DataFrame(table_data[1:], columns=table_data[0])  # Assuming first row is header
                        second_pass_dfs.append(df)

            # Process and clean DataFrames from both passes
            first_pass_cleaned = [clean_dataframe(df, team_names) for df in first_pass_dfs if not df.empty]
            second_pass_cleaned = [clean_dataframe(df, team_names) for df in second_pass_dfs if not df.empty]

            # Combine corresponding DataFrames from both passes
            combined_dfs = []
            min_length = min(len(first_pass_cleaned), len(second_pass_cleaned))
            for i in range(min_length):
                if first_pass_cleaned[i] is not None and second_pass_cleaned[i] is not None:
                    combined_df = pd.concat([first_pass_cleaned[i], second_pass_cleaned[i]], axis=0)
                    combined_dfs.append(combined_df)
                    combined_df.reset_index(inplace=True, drop=True)

            all_dfs[url] = combined_dfs

        else:
            print('Failed to retrieve the webpage. Status code:', response.status_code)

    return all_dfs

In [8]:
data_frames = scrape_data(over_list)

In [9]:
# remove the second df from each series

# Create a list of new keys
new_keys = [f'Series {i+1}' for i in range(len(data_frames))]

# Create a new dictionary with updated keys
re_dfs = dict(zip(new_keys, data_frames.values()))

In [10]:
re_dfs['Series 2']

[       name kills  deaths assists  adr fk fd
 0    zekken    22      14       7  222  7  4
 1      Sacy    19      10       9  161  0  2
 2   Zellsis    16      10       6  131  1  0
 3      TenZ    13      13      14  127  1  1
 4    johnqt    10      11       5   96  2  2
 5       s0m    14      17       6  142  4  4
 6     Ethan    14      15      10  121  3  0
 7    Victor    11      17       5  116  2  6
 8   FiNESSE     9      16       4   79  1  0
 9  crashies    10      16       4   92  0  2,
        name kills  deaths assists  adr  fk  fd
 0    zekken    46      33      21  213  12   8
 1   Zellsis    37      24      22  135   2   1
 2      TenZ    36      29      24  134   5   3
 3      Sacy    38      26      17  145   3   5
 4    johnqt    25      28       9  102   3   4
 5       s0m    30      35      21  131   4   6
 6     Ethan    32      34      23  118   5   3
 7    Victor    34      41      10  150   9  13
 8  crashies    23      35      14   92   1   3
 9   FiNESSE 

In [11]:
# Iterate through the dictionary and remove the second item from each list value
for key in re_dfs:
    if len(re_dfs[key]) > 1:
        del re_dfs[key][1]  # Delete the second item (index 1)

In [12]:
re_dfs['Series 2'] 

[       name kills  deaths assists  adr fk fd
 0    zekken    22      14       7  222  7  4
 1      Sacy    19      10       9  161  0  2
 2   Zellsis    16      10       6  131  1  0
 3      TenZ    13      13      14  127  1  1
 4    johnqt    10      11       5   96  2  2
 5       s0m    14      17       6  142  4  4
 6     Ethan    14      15      10  121  3  0
 7    Victor    11      17       5  116  2  6
 8   FiNESSE     9      16       4   79  1  0
 9  crashies    10      16       4   92  0  2,
        name kills  deaths assists  adr fk fd
 0      TenZ    23      16      10  140  4  2
 1    zekken    24      19      14  206  5  4
 2   Zellsis    21      14      16  138  1  1
 3      Sacy    19      16       8  132  3  3
 4    johnqt    15      17       4  108  1  2
 5     Ethan    18      19      13  117  2  3
 6       s0m    16      18      15  123  0  2
 7    Victor    23      24       5  177  7  7
 8  crashies    13      19      10   92  1  1
 9   FiNESSE    12      22       

# Combining DFS

In [18]:
re_dfs['Series 1']

[      name kills  deaths assists  adr fk fd
 0    mazin    20      15      10  161  3  1
 1   artzin    18      15       9  139  7  0
 2  ShahZaM    12      14       9  122  2  1
 3   liazzi    13      14       5  126  0  2
 4    Pa1nt    11      17       6   98  2  4
 5      tex    20      15       2  179  3  3
 6   Mazino    22      15       9  175  1  3
 7    kiNgg    12      14       8  111  2  3
 8      C0M    10      14       7  106  0  1
 9    aspas    11      16       4  113  2  4,
       name kills  deaths assists  adr fk fd
 0    mazin    15      14       5  150  3  1
 1   artzin    16      15       4  150  0  3
 2  ShahZaM    11      15       5  127  1  3
 3    Pa1nt     8      18       6  105  2  5
 4   liazzi     8      16       1   80  1  1
 5    kiNgg    22      10       8  210  4  0
 6    aspas    21       8       4  179  5  1
 7      C0M    17      13       5  156  2  0
 8   Mazino     9      14      12  124  1  3
 9      tex     9      13       3  104  1  3]

In [33]:
# Initialize an empty list to store all DataFrames
all_dfs = []

# Iterate through each series in re_dfs
for dfs_list in re_dfs.values():
    # Extend all_dfs with the list of DataFrames for the current series
    all_dfs.extend(dfs_list)

In [61]:
all_dfs[1]

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd
0,mazin,15,14,5,150,3,1
1,artzin,16,15,4,150,0,3
2,ShahZaM,11,15,5,127,1,3
3,Pa1nt,8,18,6,105,2,5
4,liazzi,8,16,1,80,1,1
5,kiNgg,22,10,8,210,4,0
6,aspas,21,8,4,179,5,1
7,C0M,17,13,5,156,2,0
8,Mazino,9,14,12,124,1,3
9,tex,9,13,3,104,1,3


In [70]:
# Convert each dictionary entry to a dataframe and store them in a list
proc_dfs = [pd.DataFrame(processed_dfs[key]) for key in processed_dfs]

In [82]:
proc_dfs[1]

Unnamed: 0,name,clutches
0,ShahZaM,1
1,liazzi,0
2,artzin,0
3,mazin,1
4,Pa1nt,0
5,C0M,1
6,tex,0
7,Mazino,0
8,aspas,1
9,kiNgg,0


In [74]:
# Merge the two DataFrames on 'name'
combined_df = pd.merge(all_dfs[1], proc_dfs[1], on='name')

In [75]:
combined_df

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,clutches
0,mazin,15,14,5,150,3,1,1
1,artzin,16,15,4,150,0,3,0
2,ShahZaM,11,15,5,127,1,3,1
3,Pa1nt,8,18,6,105,2,5,0
4,liazzi,8,16,1,80,1,1,0
5,kiNgg,22,10,8,210,4,0,0
6,aspas,21,8,4,179,5,1,1
7,C0M,17,13,5,156,2,0,1
8,Mazino,9,14,12,124,1,3,0
9,tex,9,13,3,104,1,3,0


In [83]:
# Example of how to iterate and merge DataFrames from index 0 to 48
combined_dfs = []

for i in range(48):  # assuming you have 49 pairs of DataFrames
    all_df = all_dfs[i]
    proc_df = proc_dfs[i]
    
    # Merge the two DataFrames on 'name'
    combined_df = pd.merge(all_df, proc_df, on='name')
    
    # Append the merged DataFrame to the list
    combined_dfs.append(combined_df)

# combined_dfs now contains all merged DataFrames from index 0 to 48

In [84]:
combined_dfs

[      name kills  deaths assists  adr fk fd  clutches
 0    mazin    20      15      10  161  3  1         1
 1   artzin    18      15       9  139  7  0         0
 2  ShahZaM    12      14       9  122  2  1         0
 3   liazzi    13      14       5  126  0  2         0
 4    Pa1nt    11      17       6   98  2  4         0
 5      tex    20      15       2  179  3  3         1
 6   Mazino    22      15       9  175  1  3         0
 7    kiNgg    12      14       8  111  2  3         0
 8      C0M    10      14       7  106  0  1         0
 9    aspas    11      16       4  113  2  4         0,
       name kills  deaths assists  adr fk fd  clutches
 0    mazin    15      14       5  150  3  1         1
 1   artzin    16      15       4  150  0  3         0
 2  ShahZaM    11      15       5  127  1  3         1
 3    Pa1nt     8      18       6  105  2  5         0
 4   liazzi     8      16       1   80  1  1         0
 5    kiNgg    22      10       8  210  4  0         0
 6    asp

In [85]:
# Assuming combined_dfs is a list containing all merged DataFrames from index 0 to 48
# Example of concatenating all DataFrames vertically
combined_all = pd.concat(combined_dfs, ignore_index=True)

In [86]:
combined_all

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,clutches
0,mazin,20,15,10,161,3,1,1
1,artzin,18,15,9,139,7,0,0
2,ShahZaM,12,14,9,122,2,1,0
3,liazzi,13,14,5,126,0,2,0
4,Pa1nt,11,17,6,98,2,4,0
...,...,...,...,...,...,...,...,...
465,Cryocells,25,13,1,199,7,2,0
466,eeiu,19,9,5,161,2,1,0
467,Asuna,14,13,24,132,2,1,1
468,bang,14,12,12,137,1,1,0


In [94]:
# Assuming combined_all is your concatenated DataFrame containing all merged DataFrames
# Convert numeric columns to numeric types if necessary
combined_all['kills'] = pd.to_numeric(combined_all['kills'], errors='coerce')
combined_all['deaths'] = pd.to_numeric(combined_all['deaths'], errors='coerce')
combined_all['assists'] = pd.to_numeric(combined_all['assists'], errors='coerce')
combined_all['adr'] = pd.to_numeric(combined_all['adr'], errors='coerce')
combined_all['fk'] = pd.to_numeric(combined_all['fk'], errors='coerce')
combined_all['fd'] = pd.to_numeric(combined_all['fd'], errors='coerce')
combined_all['clutches'] = pd.to_numeric(combined_all['clutches'], errors='coerce')

# Sum each individual stat for each player
player_stats = combined_all.groupby('name').agg({
    'kills': 'sum',
    'deaths': 'sum',
    'assists': 'sum',
    'adr': 'mean',
    'fk': 'sum',
    'fd': 'sum',
    'clutches': 'sum'
}).reset_index()

In [97]:
combined_all

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,clutches
0,mazin,20,15,10,161,3,1,1
1,artzin,18,15,9,139,7,0,0
2,ShahZaM,12,14,9,122,2,1,0
3,liazzi,13,14,5,126,0,2,0
4,Pa1nt,11,17,6,98,2,4,0
...,...,...,...,...,...,...,...,...
465,Cryocells,25,13,1,199,7,2,0
466,eeiu,19,9,5,161,2,1,0
467,Asuna,14,13,24,132,2,1,1
468,bang,14,12,12,137,1,1,0


In [98]:
# Assuming combined_all is your DataFrame
# Save to CSV
combined_all.to_csv('database.csv', index=False)

In [95]:
player_stats

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,clutches
0,Apoth,113,150,58,112.1,16,25,4
1,Asuna,138,140,95,133.333333,29,27,3
2,Boostio,137,146,68,138.222222,14,15,3
3,C0M,96,100,64,114.75,15,9,3
4,Cryocells,149,148,45,127.888889,26,22,2
5,Derrek,149,138,71,131.3,13,7,2
6,Ethan,164,152,92,137.2,25,13,3
7,JonahP,114,112,74,114.125,18,12,5
8,Khalil,172,130,68,149.111111,15,13,4
9,Klaus,20,34,11,92.5,4,6,0


In [96]:
# Assuming player_stats is your DataFrame containing summed statistics for each player
# Convert 'name' column to lowercase for case-insensitive sorting
player_stats['name_lower'] = player_stats['name'].str.lower()

# Sort the DataFrame alphabetically by 'name_lower' column
player_stats_sorted = player_stats.sort_values(by='name_lower')

# Drop the 'name_lower' column if you don't need it anymore
player_stats_sorted = player_stats_sorted.drop(columns=['name_lower'])

# Print the sorted DataFrame
player_stats_sorted

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,clutches
0,Apoth,113,150,58,112.1,16,25,4
24,artzin,140,142,43,136.444444,22,19,3
25,aspas,159,102,31,174.625,37,23,1
1,Asuna,138,140,95,133.333333,29,27,3
26,bang,134,138,67,125.0,24,29,1
2,Boostio,137,146,68,138.222222,14,15,3
3,C0M,96,100,64,114.75,15,9,3
27,cauanzin,194,175,63,149.727273,32,24,2
28,crashies,126,137,70,114.5,9,13,4
4,Cryocells,149,148,45,127.888889,26,22,2
