In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import regex as re
import csv

In [2]:
# Function to read the names (one per line) from the file
def read_names(file_path):
    """Return the non-blank lines of ``file_path`` with surrounding whitespace removed.

    Args:
        file_path: Path of a text file holding one name per line.

    Returns:
        list[str]: The stripped lines in file order. Lines that are empty or
        contain only whitespace (for example trailing blank lines) are skipped.
    """
    with open(file_path, 'r') as file:
        # Iterate the handle directly instead of materialising readlines() first.
        stripped = (line.strip() for line in file)
        # Drop blank lines so callers never receive '' as a name.
        return [name for name in stripped if name]

# Path of the text file that holds the names, relative to the notebook.
file_path = '../Data/name_list.txt'
# Load the names from the file
all_names = read_names(file_path)
# Confirmation message only; the names themselves are not printed.
print(f"Names read from file {file_path}")

# Function to read the links (one per line) from the file
def read_links(file_path):
    """Return the non-blank lines of ``file_path`` with surrounding whitespace removed.

    Args:
        file_path: Path of a text file holding one link per line.

    Returns:
        list[str]: The stripped lines in file order. Lines that are empty or
        contain only whitespace are skipped, so a stray blank line never ends
        up being treated as a URL by whoever fetches these links.
    """
    with open(file_path, 'r') as file:
        # Iterate the handle directly instead of materialising readlines() first.
        stripped = (line.strip() for line in file)
        # An empty string is not a fetchable URL, so drop blank lines here.
        return [link for link in stripped if link]

# Path of the text file that holds the links; this rebinds the file_path
# variable that was used for the names file earlier in the cell.
file_path = '../Data/over_list.txt'
# Access the links from the file
over_list = read_links(file_path)
# Confirmation message only; the links themselves are not printed.
print(f"Links read from file {file_path}")

Names read from file ../Data/name_list.txt
Links read from file ../Data/over_list.txt


In [3]:
def clean_dataframe(df, team_names):
    """Reduce one scraped 14-column stats table to the seven columns kept downstream.

    The incoming columns are relabelled positionally, the ``name`` text is
    tidied, the unused columns are dropped, and every remaining stat cell is
    reduced to its first value.

    Args:
        df: DataFrame built from a scraped stats table. It must have exactly
            14 columns; pandas raises ``ValueError`` otherwise. The input is
            not modified.
        team_names: Accepted for interface compatibility; not used here.

    Returns:
        ``None`` when ``df`` is empty, otherwise a new DataFrame with the
        columns ``name, kills, deaths, assists, adr, fk, fd``. ``deaths``
        holds integers; the other stat columns keep their text form. A cell
        that is empty (or, for ``deaths``, contains no digits) becomes a
        missing value without affecting the other rows.
    """
    if df.empty:
        return None

    def first_line(cell):
        """Return the text before the first newline, or None for an empty cell."""
        return cell.split('\n')[0] if cell else None

    def first_int(cell):
        """Return the first run of digits in ``cell`` as an int, or None if absent."""
        if not cell:
            return None
        digits = re.search(r'\d+', cell)
        # Handle a digit-free cell per row; previously a single such cell
        # raised IndexError and blanked the whole column.
        return int(digits.group()) if digits else None

    df_copy = df.copy()

    # Rename the columns (positional: the scraped header text is discarded)
    df_copy.columns = ['name', 'blank', 'rating', 'acs', 'kills', 'deaths', 'assists', 'k/d', 'KAST', 'adr', 'hs', 'fk', 'fd', 'fk/fd']

    # Clean the 'name' column; regex=False keeps the replacement literal on
    # every pandas version.
    df_copy['name'] = (
        df_copy['name']
        .str.strip()
        .str.replace('\t', '', regex=False)
        .str.replace('\n', '', regex=False)
    )

    # Drop all unneeded columns
    df_copy = df_copy.drop(columns=['blank', 'rating', 'acs', 'k/d', 'KAST', 'hs', 'fk/fd'])

    # Keep only the first value of each multi-line stat cell
    df_copy['kills'] = df_copy['kills'].apply(first_line)
    df_copy['deaths'] = df_copy['deaths'].apply(first_int)
    for column in ('assists', 'adr', 'fk', 'fd'):
        df_copy[column] = df_copy[column].apply(first_line)

    return df_copy

def _table_to_dataframe(table):
    """Convert a parsed HTML table into a DataFrame, using its first row as the header."""
    table_data = [
        [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    if not table_data:
        # A table without rows has no header to index; report it as empty.
        return pd.DataFrame()
    return pd.DataFrame(table_data[1:], columns=table_data[0])


def scrape_data(url_list, timeout=30):
    """Download each page and collect one combined stats DataFrame per game.

    For every ``div.vm-stats-game`` on a page, the first two overview tables
    inside that div are cleaned with ``clean_dataframe`` and stacked on top of
    each other. A game is kept only when both of its tables are present and
    non-empty, so the two halves of a combined frame always come from the same
    game div.

    Args:
        url_list: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict: Maps each successfully fetched URL to a list of combined
        DataFrames (one per usable game div, in page order). URLs that could
        not be fetched are reported with ``print`` and left out of the result.
    """
    all_dfs = {}  # Dictionary to store processed DataFrames for each URL

    team_names = ['MIBR', 'LEV', 'SEN', 'NRG', 'FUR', '100T', 'LOUD', 'EG', 'G2', 'C9', 'KRÜ']
    table_class = 'wf-table-inset mod-overview'

    for url in url_list:
        try:
            # Without a timeout a stalled server would block the notebook forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep going with the remaining URLs instead of aborting the whole scrape.
            print(f'Failed to retrieve {url}: {exc}')
            continue

        if response.status_code != 200:
            print('Failed to retrieve the webpage. Status code:', response.status_code)
            continue

        soup = bs(response.content, 'html.parser')

        combined_dfs = []
        for game_div in soup.find_all('div', class_='vm-stats-game'):
            # Search only inside this game div so a missing second table can
            # never be substituted by a table that belongs to the next game.
            tables = game_div.find_all('table', class_=table_class)
            if len(tables) < 2:
                continue

            first_df = clean_dataframe(_table_to_dataframe(tables[0]), team_names)
            second_df = clean_dataframe(_table_to_dataframe(tables[1]), team_names)

            # Pair the two tables per game; skipping here (rather than
            # filtering two separate lists) keeps the pairing aligned.
            if first_df is None or second_df is None:
                continue

            combined_dfs.append(
                pd.concat([first_df, second_df], axis=0, ignore_index=True)
            )

        all_dfs[url] = combined_dfs

    return all_dfs

In [4]:
# Scrape every page listed in over_list. The result maps each fetched URL to
# its list of combined per-game DataFrames; this performs network requests.
data_frames = scrape_data(over_list)

In [5]:
# Relabel the scraped results as 'Series 1', 'Series 2', ... in scrape order.

# Create a list of new keys
new_keys = [f'Series {i+1}' for i in range(len(data_frames))]

# Create a new dictionary with updated keys. Each list of DataFrames is copied
# so that later edits to re_dfs (such as deleting an entry) do not also change
# the lists held by data_frames.
re_dfs = {key: list(dfs) for key, dfs in zip(new_keys, data_frames.values())}

In [6]:
# Inspect the list of per-game DataFrames stored for the first series.
re_dfs['Series 1']

[           name kills  deaths assists  adr fk fd
 0    mazin MIBR    20      15      10  161  3  1
 1   artzin MIBR    18      15       9  139  7  0
 2  ShahZaM MIBR    12      14       9  122  2  1
 3   liazzi MIBR    13      14       5  126  0  2
 4    Pa1nt MIBR    11      17       6   98  2  4
 5       tex LEV    20      15       2  179  3  3
 6    Mazino LEV    22      15       9  175  1  3
 7     kiNgg LEV    12      14       8  111  2  3
 8       C0M LEV    10      14       7  106  0  1
 9     aspas LEV    11      16       4  113  2  4,
            name kills  deaths assists  adr fk fd
 0    mazin MIBR    35      29      15  156  6  2
 1   artzin MIBR    34      30      13  144  7  3
 2  ShahZaM MIBR    23      29      14  124  3  4
 3   liazzi MIBR    21      30       6  104  1  3
 4    Pa1nt MIBR    19      35      12  101  4  9
 5     kiNgg LEV    34      24      16  158  6  3
 6     aspas LEV    32      24       8  144  7  5
 7    Mazino LEV    31      29      21  150  2  6

In [7]:
# Remove the second item (index 1) from each series' list of DataFrames.
# The result is rebuilt from data_frames on every run, so re-running this cell
# does not delete yet another entry, and the lists stored in data_frames are
# left untouched.
re_dfs = {
    key: [df for position, df in enumerate(dfs) if position != 1]
    for key, dfs in zip(new_keys, data_frames.values())
}

In [8]:
# Inspect the first series again now that its index-1 entry has been dropped.
re_dfs['Series 1'] 

[           name kills  deaths assists  adr fk fd
 0    mazin MIBR    20      15      10  161  3  1
 1   artzin MIBR    18      15       9  139  7  0
 2  ShahZaM MIBR    12      14       9  122  2  1
 3   liazzi MIBR    13      14       5  126  0  2
 4    Pa1nt MIBR    11      17       6   98  2  4
 5       tex LEV    20      15       2  179  3  3
 6    Mazino LEV    22      15       9  175  1  3
 7     kiNgg LEV    12      14       8  111  2  3
 8       C0M LEV    10      14       7  106  0  1
 9     aspas LEV    11      16       4  113  2  4,
            name kills  deaths assists  adr fk fd
 0    mazin MIBR    15      14       5  150  3  1
 1   artzin MIBR    16      15       4  150  0  3
 2  ShahZaM MIBR    11      15       5  127  1  3
 3    Pa1nt MIBR     8      18       6  105  2  5
 4   liazzi MIBR     8      16       1   80  1  1
 5     kiNgg LEV    22      10       8  210  4  0
 6     aspas LEV    21       8       4  179  5  1
 7       C0M LEV    17      13       5  156  2  0

In [9]:
# Collect the per-series lists of DataFrames (the dictionary's values)
values_list = list(re_dfs.values())

# Chain those lists together into one flat list of DataFrames
flattened_list = []
for series_games in values_list:
    flattened_list.extend(series_games)

In [10]:
# Tag every DataFrame with its position in the flattened list as its game_id
for game_id, df in enumerate(flattened_list):
    df['game_id'] = game_id

# Leave the counter at the number of games tagged, as a running counter would
game_id = len(flattened_list)

In [11]:
# Stack every per-game DataFrame into one frame with a fresh 0..n-1 index
concatenated_df = pd.concat(flattened_list, ignore_index=True)

# The team abbreviation is the last whitespace-separated token of the name
concatenated_df['team_abrev'] = [
    full_name.split()[-1] for full_name in concatenated_df['name']
]

In [12]:
# Display the combined frame (pandas truncates the middle rows of long frames).
concatenated_df

Unnamed: 0,name,kills,deaths,assists,adr,fk,fd,game_id,team_abrev
0,mazin MIBR,20,15,10,161,3,1,0,MIBR
1,artzin MIBR,18,15,9,139,7,0,0,MIBR
2,ShahZaM MIBR,12,14,9,122,2,1,0,MIBR
3,liazzi MIBR,13,14,5,126,0,2,0,MIBR
4,Pa1nt MIBR,11,17,6,98,2,4,0,MIBR
...,...,...,...,...,...,...,...,...,...
625,trent G2,,,,,,,62,G2
626,valyn G2,,,,,,,62,G2
627,JonahP G2,,,,,,,62,G2
628,leaf G2,,,,,,,62,G2


In [13]:
# Distinct "<player> <team>" labels, ordered case-insensitively for review
stat_names = concatenated_df['name'].unique()
stat_names_sorted = list(stat_names)
stat_names_sorted.sort(key=str.lower)

In [14]:
# Scraped "<player> <team>" label -> bare player name, in case-insensitive
# alphabetical order of the label.
name_to_player = {
    'Apoth EG': 'Apoth',
    'artzin MIBR': 'artzin',
    'aspas LEV': 'aspas',
    'Asuna 100T': 'Asuna',
    'bang 100T': 'bang',
    'Boostio 100T': 'Boostio',
    'C0M LEV': 'C0M',
    'cauanzin LOUD': 'cauanzin',
    'crashies NRG': 'crashies',
    'Cryocells 100T': 'Cryocells',
    'Derrek EG': 'Derrek',
    'eeiu 100T': 'eeiu',
    'Ethan NRG': 'Ethan',
    'FiNESSE NRG': 'FiNESSE',
    'havoc FUR': 'havoc',
    'heat KRÜ': 'heat',
    'icy G2': 'icy',
    'jawgemo EG': 'jawgemo',
    'johnqt SEN': 'johnqt',
    'JonahP G2': 'JonahP',
    'keznit KRÜ': 'keznit',
    'Khalil FUR': 'Khalil',
    'kiNgg LEV': 'kiNgg',
    'Klaus KRÜ': 'Klaus',
    'leaf G2': 'leaf',
    'Less LOUD': 'Less',
    'liazzi MIBR': 'liazzi',
    'mazin MIBR': 'mazin',
    'Mazino LEV': 'Mazino',
    'Melser KRÜ': 'Melser',
    'moose C9': 'moose',
    'mta KRÜ': 'mta',
    'mwzera FUR': 'mwzera',
    'NaturE EG': 'NaturE',
    'nzr FUR': 'nzr',
    'OXY C9': 'OXY',
    'Pa1nt MIBR': 'Pa1nt',
    'Palla MIBR': 'Palla',
    'pANcada LOUD': 'pANcada',
    'rich MIBR': 'rich',
    'runi C9': 'runi',
    's0m NRG': 's0m',
    'saadhak LOUD': 'saadhak',
    'Sacy SEN': 'Sacy',
    'ShahZaM MIBR': 'ShahZaM',
    'Shyy KRÜ': 'Shyy',
    'supamen EG': 'supamen',
    'TenZ SEN': 'TenZ',
    'tex LEV': 'tex',
    'trent G2': 'trent',
    'tuyz LOUD': 'tuyz',
    'valyn G2': 'valyn',
    'vanity C9': 'vanity',
    'Victor NRG': 'Victor',
    'xand FUR': 'xand',
    'Xeppaa C9': 'Xeppaa',
    'zekken SEN': 'zekken',
    'Zellsis SEN': 'Zellsis',
}

# Two parallel lists: the scraped labels and the names that replace them
replace_data = {
    'name': list(name_to_player.keys()),
    'player_name': list(name_to_player.values()),
}

replace_df = pd.DataFrame(replace_data)

In [15]:
# Swap each scraped "<player> <team>" label for the bare player name,
# walking the two replace_df columns in step
for scraped_label, bare_name in zip(replace_df['name'], replace_df['player_name']):
    concatenated_df['name'] = concatenated_df['name'].replace(scraped_label, bare_name)

concatenated_df.columns

Index(['name', 'kills', 'deaths', 'assists', 'adr', 'fk', 'fd', 'game_id',
       'team_abrev'],
      dtype='object')

In [16]:
# Only the 'name' column changes its label. Renaming it by label (instead of
# overwriting all labels positionally) cannot mislabel the stat columns if the
# column order ever changes upstream.
concatenated_df = concatenated_df.rename(columns={'name': 'player_name'})

# Resulting column labels, kept under the original variable name
new_names = list(concatenated_df.columns)

In [17]:
# Identifier columns first, then the stat columns
new_order = [
    'game_id', 'player_name', 'team_abrev',
    'kills', 'deaths', 'assists', 'adr', 'fk', 'fd',
]

concatenated_df = concatenated_df.loc[:, new_order]

In [18]:
# Show the last ten rows to check how the final game was scraped.
concatenated_df.tail(10)

Unnamed: 0,game_id,player_name,team_abrev,kills,deaths,assists,adr,fk,fd
620,62,TenZ,SEN,,,,,,
621,62,zekken,SEN,,,,,,
622,62,Sacy,SEN,,,,,,
623,62,johnqt,SEN,,,,,,
624,62,Zellsis,SEN,,,,,,
625,62,trent,G2,,,,,,
626,62,valyn,G2,,,,,,
627,62,JonahP,G2,,,,,,
628,62,leaf,G2,,,,,,
629,62,icy,G2,,,,,,


In [19]:
# Save DataFrame to CSV file (index=False leaves the row index out of the
# file; an existing file at this path is overwritten)
concatenated_df.to_csv('../Data/over_df.csv', index=False)