In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import regex as re
import csv

In [2]:
def read_names(file_path):
    """Read a text file and return its entries, one per line.

    Leading and trailing whitespace is removed from every line, and lines that
    are empty after stripping are skipped so that blank or trailing lines in
    the file do not turn into empty entries.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    list of str
        The stripped, non-empty lines in file order.
    """
    with open(file_path, 'r') as file:
        # Iterate the handle directly instead of materialising readlines() first.
        stripped_lines = (line.strip() for line in file)
        names = [line for line in stripped_lines if line]
    return names

# Load the two input lists; each file is read line by line via `read_names`.
file_path = '../Data/name_list.txt'
all_names = read_names(file_path)
print(f"Names read from file {file_path}")

file_path = '../Data/over_list.txt'
# The URL list is read with the same helper defined above. The earlier call to
# `read_links` referred to a function that is not defined in this notebook and
# raised NameError on a fresh kernel.
over_list = read_names(file_path)
print(f"Links read from file {file_path}")

Names read from file ../Data/name_list.txt
Links read from file ../Data/over_list.txt


In [45]:
def clean_name_agent_dataframe(df):
    """Return a tidied copy of a name/agent DataFrame, or None for an empty one.

    For both the 'name' and 'agent' columns the surrounding whitespace is
    stripped and every tab and newline character is removed. The frame passed
    in is not modified.
    """
    if df.empty:
        return None

    cleaned = df.copy()

    # Apply the same three-step text clean-up to each column in turn.
    for column in ('name', 'agent'):
        text = cleaned[column].str.strip()
        text = text.str.replace('\t', '')
        cleaned[column] = text.str.replace('\n', '')

    return cleaned

def extract_table_data(table):
    """Collect (player name, agent) pairs from the rows of one table.

    Only rows that contain both a 'mod-player' cell and a 'mod-agents' cell
    contribute a record. The agent is taken from the 'title' attribute of the
    image inside the agent cell; 'Unknown Agent' is used when the cell has no
    image or the image has no title.

    Returns a DataFrame with the columns 'name' and 'agent'.
    """
    records = []
    for tr in table.find_all('tr'):
        player_cell = tr.find('td', class_='mod-player')
        agent_cell = tr.find('td', class_='mod-agents')
        if not (player_cell and agent_cell):
            # Header rows and other rows lacking either cell are ignored.
            continue

        icon = agent_cell.img
        if icon:
            agent = icon.get('title', 'Unknown Agent').strip()
        else:
            agent = 'Unknown Agent'

        records.append([player_cell.text.strip(), agent])

    return pd.DataFrame(records, columns=['name', 'agent'])

def _combine_game_tables(game_div):
    """Build one cleaned name/agent DataFrame from the first two overview tables of a game div.

    Returns None when the div holds fewer than two overview tables or when
    either table yields no player rows.
    """
    # Search inside the div only. `find_next` walks the remainder of the whole
    # document, so a div with a single table would silently be paired with a
    # table that belongs to the following game.
    # NOTE(review): assumes both tables of a game are nested inside its
    # 'vm-stats-game' div - confirm against the page markup.
    tables = game_div.find_all('table', class_='wf-table-inset mod-overview', limit=2)
    if len(tables) < 2:
        return None

    team_dfs = [extract_table_data(table) for table in tables]
    if any(team_df.empty for team_df in team_dfs):
        return None

    cleaned = [clean_name_agent_dataframe(team_df) for team_df in team_dfs]
    return pd.concat(cleaned, axis=0).reset_index(drop=True)


def scrape_name_agent_data(url_list, timeout=30):
    """Scrape player/agent tables from every URL and group them per game.

    Parameters
    ----------
    url_list : iterable of str
        Pages to download.
    timeout : float or tuple, optional
        Passed to `requests.get` so that a stalled connection cannot block the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    dict
        Maps each successfully fetched URL to a list of DataFrames, one per
        'vm-stats-game' div that contains two populated overview tables. Each
        DataFrame stacks the first table's rows on top of the second table's
        rows and has the columns 'name' and 'agent'. URLs that fail to
        download are reported on stdout and left out of the dictionary.
    """
    all_dfs = {}  # URL -> list of per-game DataFrames

    for url in url_list:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep going: one unreachable page should not abort the whole run.
            print(f'Failed to retrieve the webpage {url}: {exc}')
            continue

        if response.status_code != 200:
            print('Failed to retrieve the webpage. Status code:', response.status_code)
            continue

        soup = bs(response.content, 'html.parser')

        # Pair the two tables game by game so that an empty or missing table
        # can never shift the pairing of the games that follow it.
        combined_dfs = []
        for game_div in soup.find_all('div', class_='vm-stats-game'):
            combined_df = _combine_game_tables(game_div)
            if combined_df is not None:
                combined_dfs.append(combined_df)

        all_dfs[url] = combined_dfs

    return all_dfs

In [46]:
# Download and parse every URL in `over_list`; `result` maps each URL to its list of per-game DataFrames.
result = scrape_name_agent_data(over_list)

In [47]:
# Relabel the scraped results: swap each URL key for a sequential "Series N" label.
# The values are the very same list objects held by `result`, not copies.
new_keys = [f'Series {position}' for position in range(1, len(result) + 1)]

re_dfs = {label: games for label, games in zip(new_keys, result.values())}

In [48]:
# Inspect the list of per-game DataFrames collected for the first series
re_dfs['Series 1']

[           name    agent
 0    mazin MIBR     Omen
 1   artzin MIBR     Yoru
 2  ShahZaM MIBR     Sova
 3   liazzi MIBR  Killjoy
 4    Pa1nt MIBR      Iso
 5       tex LEV  Killjoy
 6    Mazino LEV     Kayo
 7     kiNgg LEV     Omen
 8       C0M LEV     Sova
 9     aspas LEV     Jett,
            name    agent
 0    mazin MIBR     Omen
 1   artzin MIBR     Yoru
 2  ShahZaM MIBR     Sova
 3   liazzi MIBR  Killjoy
 4    Pa1nt MIBR      Iso
 5     kiNgg LEV     Omen
 6     aspas LEV     Jett
 7    Mazino LEV     Kayo
 8       C0M LEV     Sova
 9       tex LEV  Killjoy,
            name    agent
 0    mazin MIBR    Viper
 1   artzin MIBR    Gekko
 2  ShahZaM MIBR     Sova
 3    Pa1nt MIBR     Jett
 4   liazzi MIBR  Killjoy
 5     kiNgg LEV    Viper
 6     aspas LEV     Jett
 7       C0M LEV     Sova
 8    Mazino LEV   Harbor
 9       tex LEV  Killjoy]

In [49]:
# Drop the second DataFrame (index 1) from every series; series with fewer than
# two DataFrames are kept as they are.
# NOTE(review): index 1 is presumably an aggregate view rather than a single game - confirm.
# The lists are rebuilt from `result` instead of being deleted from in place, so
# re-running this cell cannot strip additional games, and the lists stored in
# `result` are left untouched.
re_dfs = {
    key: games[:1] + games[2:]
    for key, games in zip(re_dfs, result.values())
}

In [50]:
# Inspect the first series again now that its second DataFrame has been dropped
re_dfs['Series 1']

[           name    agent
 0    mazin MIBR     Omen
 1   artzin MIBR     Yoru
 2  ShahZaM MIBR     Sova
 3   liazzi MIBR  Killjoy
 4    Pa1nt MIBR      Iso
 5       tex LEV  Killjoy
 6    Mazino LEV     Kayo
 7     kiNgg LEV     Omen
 8       C0M LEV     Sova
 9     aspas LEV     Jett,
            name    agent
 0    mazin MIBR    Viper
 1   artzin MIBR    Gekko
 2  ShahZaM MIBR     Sova
 3    Pa1nt MIBR     Jett
 4   liazzi MIBR  Killjoy
 5     kiNgg LEV    Viper
 6     aspas LEV     Jett
 7       C0M LEV     Sova
 8    Mazino LEV   Harbor
 9       tex LEV  Killjoy]

In [51]:
# Collect the per-series lists of game DataFrames (the dictionary's values)
values_list = [*re_dfs.values()]

# Flatten them into one flat list of DataFrames, keeping the series order
flattened_list = [game_df for series_games in values_list for game_df in series_games]

In [52]:
# Running counter used as a synthetic per-game identifier (0, 1, 2, ...)
game_id = 0

# Tag every row of each game's DataFrame with that game's identifier.
# The column is assigned in place on the DataFrames held in `flattened_list`.
for df in flattened_list:
    df['game_id'] = game_id
    game_id += 1

In [53]:
# Stack all per-game DataFrames into one long table; ignore_index=True discards
# each frame's own row labels and renumbers the rows from 0.
concatenated_df = pd.concat(flattened_list, ignore_index=True)

In [54]:
# Distinct raw labels found in the 'name' column, then sorted case-insensitively
stat_names = concatenated_df['name'].unique()
stat_names_sorted = sorted(stat_names, key=str.lower)

In [55]:
# Lookup table from the label as it appears in the scraped 'name' column (a player
# handle followed by what appears to be a team tag) to the bare player handle.
# The two lists are parallel: entries at the same position belong together.
replace_data = {'name': ['Apoth EG', 'artzin MIBR', 'aspas LEV', 'Asuna 100T', 'bang 100T', 'Boostio 100T', 'C0M LEV', 
                         'cauanzin LOUD', 'crashies NRG', 'Cryocells 100T', 'Derrek EG', 'eeiu 100T', 'Ethan NRG', 
                         'FiNESSE NRG', 'havoc FUR', 'heat KRÜ', 'icy G2', 'jawgemo EG', 'johnqt SEN', 'JonahP G2', 
                         'keznit KRÜ', 'Khalil FUR', 'kiNgg LEV', 'Klaus KRÜ', 'leaf G2', 'Less LOUD', 'liazzi MIBR', 
                         'mazin MIBR', 'Mazino LEV', 'Melser KRÜ', 'moose C9', 'mta KRÜ', 'mwzera FUR', 'NaturE EG', 
                         'nzr FUR', 'OXY C9', 'Pa1nt MIBR', 'Palla MIBR', 'pANcada LOUD', 'rich MIBR', 'runi C9', 's0m NRG', 
                         'saadhak LOUD', 'Sacy SEN', 'ShahZaM MIBR', 'Shyy KRÜ', 'supamen EG', 'TenZ SEN', 'tex LEV', 
                         'trent G2', 'tuyz LOUD', 'valyn G2', 'vanity C9', 'Victor NRG', 'xand FUR', 'Xeppaa C9', 
                         'zekken SEN', 'Zellsis SEN'],
                'player_name': ['Apoth', 'artzin', 'aspas', 'Asuna', 'bang', 'Boostio', 'C0M', 'cauanzin', 'crashies', 
                               'Cryocells', 'Derrek', 'eeiu', 'Ethan', 'FiNESSE', 'havoc', 'heat', 'icy', 'jawgemo', 'johnqt', 
                               'JonahP', 'keznit', 'Khalil', 'kiNgg', 'Klaus', 'leaf', 'Less', 'liazzi', 'mazin', 'Mazino', 
                               'Melser', 'moose', 'mta', 'mwzera', 'NaturE', 'nzr', 'OXY', 'Pa1nt', 'Palla', 'pANcada', 'rich', 
                               'runi', 's0m', 'saadhak', 'Sacy', 'ShahZaM', 'Shyy', 'supamen', 'TenZ', 'tex', 'trent', 'tuyz', 
                               'valyn', 'vanity', 'Victor', 'xand', 'Xeppaa', 'zekken', 'Zellsis']}

# Tabular form of the lookup, one row per (name, player_name) pair
replace_df = pd.DataFrame(replace_data)

In [56]:
# Swap every raw label in the 'name' column for its bare player handle.
# A single dict-based replace covers the whole lookup in one pass over the column,
# instead of rescanning the column once per row of `replace_df`. Labels that are
# not in the lookup are left unchanged.
concatenated_df['name'] = concatenated_df['name'].replace(
    dict(zip(replace_df['name'], replace_df['player_name']))
)

concatenated_df.columns

Index(['name', 'agent', 'game_id'], dtype='object')

In [57]:
# Rename the columns positionally: 'name' becomes 'player_name', the other two keep their names.
# NOTE(review): this relies on the current column order being ['name', 'agent', 'game_id'].
new_names = ['player_name', 'agent', 'game_id']
concatenated_df.columns = new_names

In [58]:
# Put the identifier column first, followed by the player and the agent
new_order = ['game_id', 'player_name', 'agent']

concatenated_df = concatenated_df[new_order]

In [60]:
# Preview the last 40 rows of the final table
concatenated_df.tail(40)

Unnamed: 0,game_id,player_name,agent
590,59,saadhak,Unknown Agent
591,59,Less,Unknown Agent
592,59,tuyz,Unknown Agent
593,59,cauanzin,Unknown Agent
594,59,pANcada,Unknown Agent
595,59,Asuna,Unknown Agent
596,59,bang,Unknown Agent
597,59,Cryocells,Unknown Agent
598,59,eeiu,Unknown Agent
599,59,Boostio,Unknown Agent


In [62]:
# Write the final table to CSV; index=False leaves the row index out of the file
concatenated_df.to_csv('../Data/agent_data.csv', index=False)