In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import regex as re

In [2]:
def clean_dataframe(df, team_names):
    """Reduce one scraped advanced-stats table to per-player aces and clutches.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with exactly 14 columns, in the order
        name, blank, 2K, 3K, 4K, 5K, 1v1, 1v2, 1v3, 1v4, 1v5, ECON, PL, DE.
        The columns are renamed positionally, so pandas raises if the column
        count differs. For the 5K and 1vN columns only the first line of each
        cell's text is used as the count.
    team_names : list of str
        Accepted for backward compatibility; currently unused. The team tag
        is NOT stripped from the ``name`` column here.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``name``, ``5K`` and ``clutches`` (the row-wise
        sum of 1v1..1v5), or ``None`` when ``df`` is empty.
    """
    if df.empty:
        return None

    clutch_columns = ['1v1', '1v2', '1v3', '1v4', '1v5']
    count_columns = ['5K'] + clutch_columns

    cleaned = df.copy()
    cleaned.columns = ['name', 'blank', '2K', '3K', '4K', '5K', '1v1', '1v2', '1v3', '1v4', '1v5', 'ECON', 'PL', 'DE']

    # Remove surrounding whitespace plus any embedded tabs/newlines in names
    cleaned['name'] = cleaned['name'].str.strip().str.replace('\t', '').str.replace('\n', '')

    # Keep only the columns that feed the output
    cleaned = cleaned[['name'] + count_columns].copy()

    # One shared parser for every count column instead of six copied lambdas
    for column in count_columns:
        cleaned[column] = cleaned[column].apply(_first_count).astype(int)

    cleaned['clutches'] = cleaned[clutch_columns].sum(axis=1)

    return cleaned.drop(columns=clutch_columns)


def _first_count(cell):
    """Return the integer on the first line of a cell; 0 for blank/missing cells."""
    if isinstance(cell, str):
        first_line = cell.split('\n')[0].strip()
        return int(first_line) if first_line else 0
    # None and NaN both mean "no value"
    if pd.isna(cell):
        return 0
    return int(cell)

def scrape_data(url_list, timeout=30):
    """Download each page and extract its cleaned advanced-stats tables.

    For every URL that answers with HTTP 200, each ``div.vm-stats-game``
    element is searched for a ``table.wf-table-inset.mod-adv-stats``; the
    table's first row is taken as the header and the remaining rows as data,
    and the result is passed through ``clean_dataframe``.

    Parameters
    ----------
    url_list : iterable of str
        Pages to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP request before ``requests`` raises.
        Without a timeout a stalled server would block the scrape forever.

    Returns
    -------
    dict
        Maps each successfully fetched URL to a list of cleaned DataFrames,
        in page order. URLs that return a non-200 status are reported on
        stdout and omitted from the dictionary.
    """
    all_dfs = {}  # url -> list of cleaned DataFrames

    team_names = ['MIBR', 'LEV', 'SEN', 'NRG', 'FUR', '100T', 'LOUD', 'EG', 'G2', 'C9', 'KRÜ']

    for url in url_list:
        # Request errors (including timeouts) propagate on purpose: silently
        # dropping a page would shift every positional index derived from
        # the result.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print('Failed to retrieve the webpage. Status code:', response.status_code, 'URL:', url)
            continue

        soup = bs(response.content, 'html.parser')

        game_frames = []
        for game_div in soup.find_all('div', class_='vm-stats-game'):
            table = game_div.find('table', class_='wf-table-inset mod-adv-stats')
            if not table:
                continue

            # One list of stripped cell texts per table row
            table_data = [
                [cell.text.strip() for cell in row.find_all(['td', 'th'])]
                for row in table.find_all('tr')
            ]
            if not table_data:
                # A table without rows has no header to build a frame from
                continue

            raw_df = pd.DataFrame(table_data[1:], columns=table_data[0])  # Assuming first row is header
            if raw_df.empty:
                continue

            cleaned_df = clean_dataframe(raw_df, team_names)
            if cleaned_df is not None:
                game_frames.append(cleaned_df.reset_index(drop=True))

        all_dfs[url] = game_frames

    return all_dfs

In [3]:
# Read the links listed in a text file, one per line
def read_links(file_path):
    """Return the non-blank lines of ``file_path`` with whitespace stripped.

    Blank lines are skipped so that a stray empty line in the file does not
    turn into an empty-string URL downstream.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    list of str
        One entry per non-blank line, in file order.
    """
    with open(file_path, 'r') as file:
        stripped_lines = [line.strip() for line in file]
    return [link for link in stripped_lines if link]

# Text file that lists the pages to scrape, one link per line
file_path = '../Data/perf_list.txt'
# Load the links from the file
perf_list = read_links(file_path)
print(f"Links read from file {file_path}")

# Fetch every link and build {url: [cleaned DataFrame, ...]}.
# This issues one HTTP request per link.
data_frames = scrape_data(perf_list)

Links read from file ../Data/perf_list.txt


In [4]:
# Spot-check one entry: take the second URL (index 1, in insertion order)
# and display its list of DataFrames.
# NOTE: `df` is bound to a list of DataFrames here, not to a single DataFrame.
entry_key = list(data_frames.keys())[1]
df = data_frames[entry_key]
df

[          name  5K  clutches
 0      TenZSEN   0         0
 1      SacySEN   0         0
 2   ZellsisSEN   1         2
 3    johnqtSEN   0         0
 4    zekkenSEN   0         0
 5  crashiesNRG   0         1
 6    VictorNRG   0         0
 7   FiNESSENRG   0         1
 8       s0mNRG   0         3
 9     EthanNRG   0         0,
           name  5K  clutches
 0      TenZSEN   0         0
 1      SacySEN   0         0
 2   ZellsisSEN   1         0
 3    johnqtSEN   0         0
 4    zekkenSEN   0         0
 5  crashiesNRG   0         0
 6    VictorNRG   0         0
 7   FiNESSENRG   0         0
 8       s0mNRG   0         1
 9     EthanNRG   0         0,
           name  5K  clutches
 0      TenZSEN   0         0
 1      SacySEN   0         0
 2   ZellsisSEN   0         2
 3    johnqtSEN   0         0
 4    zekkenSEN   0         0
 5  crashiesNRG   0         1
 6    VictorNRG   0         0
 7   FiNESSENRG   0         1
 8       s0mNRG   0         2
 9     EthanNRG   0         0]

In [5]:
# Drop the first DataFrame from every URL that has more than one.
# In the entry displayed above, the first frame's counts equal the sums of the
# remaining frames, so it appears to be a whole-match aggregate rather than a
# single game -- TODO confirm this holds for every page.
# NOTE: the lists are mutated in place, so this cell is not idempotent;
# re-running it removes one more frame from each list.
for key in data_frames:
    if len(data_frames[key]) > 1:
        del data_frames[key][0]  # index 0 is the first item in the list

In [6]:
# Repeat the spot-check on the second URL (index 1, in insertion order) to
# confirm that its first DataFrame has been removed.
# NOTE: `df` is bound to a list of DataFrames here, not to a single DataFrame.
entry_key = list(data_frames.keys())[1]
df = data_frames[entry_key]
df

[          name  5K  clutches
 0      TenZSEN   0         0
 1      SacySEN   0         0
 2   ZellsisSEN   1         0
 3    johnqtSEN   0         0
 4    zekkenSEN   0         0
 5  crashiesNRG   0         0
 6    VictorNRG   0         0
 7   FiNESSENRG   0         0
 8       s0mNRG   0         1
 9     EthanNRG   0         0,
           name  5K  clutches
 0      TenZSEN   0         0
 1      SacySEN   0         0
 2   ZellsisSEN   0         2
 3    johnqtSEN   0         0
 4    zekkenSEN   0         0
 5  crashiesNRG   0         1
 6    VictorNRG   0         0
 7   FiNESSENRG   0         1
 8       s0mNRG   0         2
 9     EthanNRG   0         0]

In [7]:
# Collect the dictionary's values (one list of DataFrames per URL)
values_list = list(data_frames.values())

# Chain the per-URL lists into a single flat list of DataFrames
flattened_list = []
for sublist in values_list:
    flattened_list.extend(sublist)

In [8]:
# Stamp every DataFrame with its position in the flat list (0, 1, 2, ...)
# as a 'game_id' column; the frames are modified in place.
for game_id, frame in enumerate(flattened_list):
    frame['game_id'] = game_id

In [9]:
# Stack all DataFrames in the flat list into one frame; ignore_index=True
# discards the per-frame row labels and renumbers the rows from 0.
concatenated_df = pd.concat(flattened_list, ignore_index=True)

In [10]:
# List every distinct value of the raw 'name' column, sorted
# case-insensitively, so the scraped name strings can be inspected.
stat_names = concatenated_df['name'].unique()
stat_names_sorted = sorted(stat_names, key=str.lower)
print(stat_names_sorted)

['ApothEG', 'artzinMIBR', 'aspasLEV', 'Asuna100T', 'bang100T', 'Boostio100T', 'C0MLEV', 'cauanzinLOUD', 'crashiesNRG', 'Cryocells100T', 'DerrekEG', 'eeiu100T', 'EthanNRG', 'FiNESSENRG', 'havocFUR', 'heatKRÜ', 'icyG2', 'jawgemoEG', 'johnqtSEN', 'JonahPG2', 'keznitKRÜ', 'KhalilFUR', 'kiNggLEV', 'KlausKRÜ', 'leafG2', 'LessLOUD', 'liazziMIBR', 'mazinMIBR', 'MazinoLEV', 'MelserKRÜ', 'mooseC9', 'mtaKRÜ', 'mwzeraFUR', 'NaturEEG', 'nzrFUR', 'OXYC9', 'Pa1ntMIBR', 'PallaMIBR', 'pANcadaLOUD', 'richMIBR', 'runiC9', 's0mNRG', 'saadhakLOUD', 'SacySEN', 'ShahZaMMIBR', 'ShyyKRÜ', 'supamenEG', 'TenZSEN', 'texLEV', 'trentG2', 'tuyzLOUD', 'valynG2', 'vanityC9', 'VictorNRG', 'xandFUR', 'XeppaaC9', 'zekkenSEN', 'ZellsisSEN']


In [11]:
# Lookup table from the raw scraped name (player handle with the team tag
# appended, e.g. 'TenZSEN') to the bare player handle (e.g. 'TenZ').
# The two lists are parallel: entry i of 'name' maps to entry i of
# 'player_name', so they must stay the same length and in the same order.
replace_data = {'name': ['ApothEG', 'artzinMIBR', 'aspasLEV', 'Asuna100T', 'bang100T', 'Boostio100T', 'C0MLEV', 'cauanzinLOUD', 
                         'crashiesNRG', 'Cryocells100T', 'DerrekEG', 'eeiu100T', 'EthanNRG', 'FiNESSENRG', 'havocFUR', 'heatKRÜ', 
                         'icyG2', 'jawgemoEG', 'johnqtSEN', 'JonahPG2', 'keznitKRÜ', 'KhalilFUR', 'kiNggLEV', 'KlausKRÜ', 
                         'leafG2', 'LessLOUD', 'liazziMIBR','mazinMIBR', 'MazinoLEV', 'MelserKRÜ', 'mooseC9', 'mtaKRÜ', 'mwzeraFUR', 
                         'NaturEEG', 'nzrFUR', 'OXYC9', 'Pa1ntMIBR', 'PallaMIBR', 'pANcadaLOUD', 'richMIBR', 'runiC9', 's0mNRG', 
                         'saadhakLOUD', 'SacySEN', 'ShahZaMMIBR', 'ShyyKRÜ', 'supamenEG', 'TenZSEN', 'texLEV', 'trentG2', 
                         'tuyzLOUD', 'valynG2', 'vanityC9', 'VictorNRG', 'xandFUR', 'XeppaaC9', 'zekkenSEN', 'ZellsisSEN'],
                'player_name': ['Apoth', 'artzin', 'aspas', 'Asuna', 'bang', 'Boostio', 'C0M', 'cauanzin', 'crashies', 
                               'Cryocells', 'Derrek', 'eeiu', 'Ethan', 'FiNESSE', 'havoc', 'heat', 'icy', 'jawgemo', 'johnqt', 
                               'JonahP', 'keznit', 'Khalil', 'kiNgg', 'Klaus', 'leaf', 'Less', 'liazzi', 'mazin', 'Mazino', 
                               'Melser', 'moose', 'mta', 'mwzera', 'NaturE', 'nzr', 'OXY', 'Pa1nt', 'Palla', 'pANcada', 'rich', 
                               'runi', 's0m', 'saadhak', 'Sacy', 'ShahZaM', 'Shyy', 'supamen', 'TenZ', 'tex', 'trent', 'tuyz', 
                               'valyn', 'vanity', 'Victor', 'xand', 'Xeppaa', 'zekken', 'Zellsis']}

# One row per mapping, with columns 'name' and 'player_name'
replace_df = pd.DataFrame(replace_data)

In [12]:
# Replace the raw names with the bare player handles from replace_df.
# A single dict-based replace swaps every name in one pass, instead of
# rescanning the whole column once per lookup row.
name_map = dict(zip(replace_df['name'], replace_df['player_name']))
concatenated_df['name'] = concatenated_df['name'].replace(name_map)

concatenated_df.columns

Index(['name', '5K', 'clutches', 'game_id'], dtype='object')

In [13]:
# Rename by label rather than by position. Assigning a list to `.columns`
# relabels whatever order the columns happen to be in, so re-running that
# version after the columns were reordered would silently mislabel them;
# a label-based rename is a no-op on a second run.
column_renames = {'name': 'player_name', '5K': 'aces'}
concatenated_df = concatenated_df.rename(columns=column_renames)

In [14]:
# Put the columns in their final output order; selecting with a list of
# labels returns the frame with exactly these columns, in this order.
new_order = ['game_id', 'player_name', 'clutches', 'aces']

concatenated_df = concatenated_df[new_order]

In [15]:
# Preview up to the first 30 rows of the final table
concatenated_df.head(30)

Unnamed: 0,game_id,player_name,clutches,aces
0,0,ShahZaM,0,0
1,0,liazzi,0,0
2,0,artzin,0,0
3,0,mazin,1,0
4,0,Pa1nt,0,0
5,0,C0M,0,0
6,0,tex,1,0
7,0,Mazino,0,0
8,0,aspas,0,0
9,0,kiNgg,0,0


In [16]:
# Write the final table to CSV; index=False leaves the row index out of the
# file. The path is relative to the notebook's working directory.
concatenated_df.to_csv('../Data/perf_df.csv', index=False)