In [1]:
# import dependencies
from bs4 import BeautifulSoup as bs
import requests
import re
import timeit
import pandas as pd

In [2]:
# Function to read links from the file (one link per line)
def read_links(file_path):
    """Read a text file of links and return them as a list of strings.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping (blank separators, a trailing newline at end of file) are
    dropped so that callers never receive an empty string as a link.

    Args:
        file_path: Path of the text file to read, one link per line.

    Returns:
        list[str]: The non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [link for link in stripped_lines if link]

# Path to the text file that lists the links to scrape, one per line
file_path = '../Data/over_list.txt'
# Load the links into a list; `over_list` is the input to the scraping step
over_list = read_links(file_path)
# Progress message only: it reports the path, not how many links were read
print(f"Links read from file {file_path}")

Links read from file ../Data/over_list.txt


In [None]:
def extract_scores(url_list, timeout=30):
    """Scrape per-map scores from a list of match-page URLs.

    NOTE: a later cell in this notebook defines ``extract_scores`` again
    (that version cleans text with ``clean_text``). When the notebook is run
    top to bottom the later definition replaces this one.

    For every URL the page is downloaded and parsed. Each
    ``div.vm-stats-game`` containing a ``div.vm-stats-game-header`` produces
    one record, provided exactly two teams with integer scores are found in
    the header.

    Args:
        url_list: Iterable of page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the notebook indefinitely.

    Returns:
        list[dict]: One dict per map with the keys ``team_left``,
        ``score_left``, ``team_right``, ``score_right``, ``map_name`` and
        ``map_duration``. ``map_name`` / ``map_duration`` are ``"Unknown"``
        when the corresponding markup is missing.

    Download failures, missing team markup and non-integer scores are
    reported with ``print`` and skipped; they never raise.
    """
    all_scores = []  # One dict per map, accumulated across every URL

    for url in url_list:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        except requests.RequestException as e:
            print(f'Failed to retrieve the webpage at {url}. Error: {e}')
            continue

        soup = bs(response.content, 'html.parser')

        for game_div in soup.find_all('div', class_='vm-stats-game'):
            game_header = game_div.find('div', class_='vm-stats-game-header')
            if game_header is None:
                continue

            team_names = []
            team_scores = []

            for div_team in game_header.find_all('div', class_='team'):
                name_div = div_team.find('div', class_='team-name')
                score_div = div_team.find('div', class_='score')
                # find() returns None when the element is absent; skip the
                # team instead of crashing on `.text`.
                if name_div is None or score_div is None:
                    print(f'Skipping a team entry with missing name/score markup at {url}.')
                    continue

                # Remove the newlines and tabs embedded in the name markup
                team_name = re.sub(r'[\n\t]+', '', name_div.text.strip())
                team_score = score_div.text.strip()
                try:
                    team_score = int(team_score)
                except ValueError:
                    print(f"Failed to convert score to integer for team {team_name}. Score: {team_score}")
                    continue

                team_names.append(team_name)
                team_scores.append(team_score)

            # Map name and duration fall back to "Unknown" when not present
            map_div = game_header.find('div', class_='map')
            map_span = map_div.find('span') if map_div is not None else None
            map_name = map_span.text.strip() if map_span is not None else "Unknown"

            duration_div = game_header.find('div', class_='map-duration')
            map_duration = duration_div.text.strip() if duration_div is not None else "Unknown"

            # Names and scores are appended together, so one length check suffices
            if len(team_names) == 2:
                all_scores.append({
                    'team_left': team_names[0],
                    'score_left': team_scores[0],
                    'team_right': team_names[1],
                    'score_right': team_scores[1],
                    'map_name': map_name,
                    'map_duration': map_duration,
                })

    return all_scores

In [3]:
def clean_text(text):
    """Normalise scraped text: drop the "PICK" marker and tidy whitespace.

    The standalone word "PICK" (any letter case) is removed, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw text extracted from the page.

    Returns:
        str: The cleaned text; may be empty.
    """
    # Word boundaries keep names that merely contain the letters "pick"
    # (e.g. "Pickaxe") intact. Replacing with a space, not '', stops the
    # words on either side of the marker from being glued together.
    text = re.sub(r'\bPICK\b', ' ', text, flags=re.IGNORECASE)

    # Collapse whitespace last so the gap left by the marker is tidied too
    return re.sub(r'\s+', ' ', text).strip()

def extract_scores(url_list, timeout=30):
    """Scrape per-map scores from a list of match-page URLs.

    For every URL the page is downloaded and parsed. Each
    ``div.vm-stats-game`` containing a ``div.vm-stats-game-header`` produces
    one record, provided exactly two teams with integer scores are found in
    the header. Team names, map names and durations are passed through
    ``clean_text``.

    Args:
        url_list: Iterable of page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the notebook indefinitely.

    Returns:
        list[dict]: One dict per map with the keys ``team_left``,
        ``score_left``, ``team_right``, ``score_right``, ``map_name`` and
        ``map_duration``. ``map_name`` / ``map_duration`` are ``"Unknown"``
        when the corresponding markup is missing.

    Download failures, missing team markup and non-integer scores are
    reported with ``print`` and skipped; they never raise.
    """
    all_scores = []  # One dict per map, accumulated across every URL

    for url in url_list:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        except requests.RequestException as e:
            print(f'Failed to retrieve the webpage at {url}. Error: {e}')
            continue

        soup = bs(response.content, 'html.parser')

        for game_div in soup.find_all('div', class_='vm-stats-game'):
            game_header = game_div.find('div', class_='vm-stats-game-header')
            if game_header is None:
                continue

            team_names = []
            team_scores = []

            for div_team in game_header.find_all('div', class_='team'):
                name_div = div_team.find('div', class_='team-name')
                score_div = div_team.find('div', class_='score')
                # find() returns None when the element is absent; skip the
                # team instead of crashing on `.text`.
                if name_div is None or score_div is None:
                    print(f'Skipping a team entry with missing name/score markup at {url}.')
                    continue

                team_name = clean_text(name_div.text.strip())
                team_score = score_div.text.strip()
                try:
                    team_score = int(team_score)
                except ValueError:
                    print(f"Failed to convert score to integer for team {team_name}. Score: {team_score}")
                    continue

                team_names.append(team_name)
                team_scores.append(team_score)

            # The map div may exist without a <span>; treat both cases as unknown
            map_div = game_header.find('div', class_='map')
            map_span = map_div.find('span') if map_div is not None else None
            if map_span is not None:
                map_name = clean_text(map_span.text.strip())
            else:
                map_name = "Unknown"

            duration_div = game_header.find('div', class_='map-duration')
            if duration_div is not None:
                map_duration = clean_text(duration_div.text.strip())
            else:
                map_duration = "Unknown"

            # Names and scores are appended together, so one length check suffices
            if len(team_names) == 2:
                all_scores.append({
                    'team_left': team_names[0],
                    'score_left': team_scores[0],
                    'team_right': team_names[1],
                    'score_right': team_scores[1],
                    'map_name': map_name,
                    'map_duration': map_duration,
                })

    return all_scores

In [4]:
# Scrape every page in over_list; each element of `scores` is one per-map dict
scores = extract_scores(over_list)

# Convert the list of dictionaries to a DataFrame (one row per dict)
scores_df = pd.DataFrame(scores)
# Preview at most the first 20 rows
scores_df.head(20)

Unnamed: 0,team_left,score_left,team_right,score_right,map_name,map_duration
0,MIBR,9,Leviatán,13,Ascent,1:05:24
1,MIBR,7,Leviatán,13,Icebox,45:04
2,Sentinels,13,NRG Esports,8,Lotus,59:21
3,Sentinels,14,NRG Esports,12,Sunset,1:00:54
4,FURIA,14,100 Thieves,12,Icebox,1:12:51
5,FURIA,16,100 Thieves,14,Haven,1:00:50
6,LOUD,13,Evil Geniuses,7,Lotus,52:39
7,LOUD,6,Evil Geniuses,13,Sunset,40:31
8,LOUD,11,Evil Geniuses,13,Icebox,55:13
9,G2 Esports,13,Cloud9,11,Ascent,1:05:05


In [5]:
# Number the rows 0..n-1 and store that running index as the game_id column
scores_df = scores_df.assign(game_id=range(len(scores_df)))

In [6]:
# Display the frame to check the newly added game_id column.
# NOTE(review): the recorded output ends with rows whose map_name is 'TBD',
# whose duration is '-' and whose scores are 0-0 — presumably maps that were
# never played; confirm whether they should be filtered out before saving.
scores_df

Unnamed: 0,team_left,score_left,team_right,score_right,map_name,map_duration,game_id
0,MIBR,9,Leviatán,13,Ascent,1:05:24,0
1,MIBR,7,Leviatán,13,Icebox,45:04,1
2,Sentinels,13,NRG Esports,8,Lotus,59:21,2
3,Sentinels,14,NRG Esports,12,Sunset,1:00:54,3
4,FURIA,14,100 Thieves,12,Icebox,1:12:51,4
...,...,...,...,...,...,...,...
58,LOUD,0,100 Thieves,0,TBD,-,58
59,LOUD,0,100 Thieves,0,TBD,-,59
60,Sentinels,0,G2 Esports,0,TBD,-,60
61,Sentinels,0,G2 Esports,0,TBD,-,61


In [7]:
# Rename the positional left/right columns to home/away.
# Columns not listed here (game_id, map_name, map_duration) keep their names,
# so no identity mappings are needed for them.
scores_df = scores_df.rename(columns={
    'team_left': 'home_team',
    'score_left': 'home_score',
    'team_right': 'away_team',
    'score_right': 'away_score',
})

In [8]:
# Target column layout: identifiers first, then teams, duration and scores
new_column_order = [
    'game_id',
    'map_name',
    'home_team',
    'away_team',
    'map_duration',
    'home_score',
    'away_score',
]

# Select the columns in exactly that order
scores_df = scores_df.loc[:, new_column_order]

In [9]:
# List the distinct team names on each side, home first and then away
for team_column in ('home_team', 'away_team'):
    print(scores_df[team_column].unique())

['MIBR' 'Sentinels' 'FURIA' 'LOUD' 'G2 Esports' 'Evil Geniuses' 'Leviatán'
 'NRG Esports' 'Cloud9']
['Leviatán' 'NRG Esports' '100 Thieves' 'Evil Geniuses' 'Cloud9' 'FURIA'
 'KRÜ Esports' 'MIBR' 'G2 Esports']


In [10]:
# (full team name, abbreviation) pairs, kept side by side so a name and its
# abbreviation cannot drift out of alignment
_team_abbreviation_pairs = [
    ('MIBR', 'MIBR'),
    ('Leviatán', 'LEV'),
    ('Sentinels', 'SEN'),
    ('NRG Esports', 'NRG'),
    ('FURIA', 'FUR'),
    ('100 Thieves', '100T'),
    ('LOUD', 'LOUD'),
    ('Evil Geniuses', 'EG'),
    ('G2 Esports', 'G2'),
    ('Cloud9', 'C9'),
    ('KRÜ Esports', 'KRÜ'),
]

# Column-oriented form of the pairs, used to build the lookup table
replace_data = {
    'team_name': [full_name for full_name, _ in _team_abbreviation_pairs],
    'team_abrev': [short_name for _, short_name in _team_abbreviation_pairs],
}

replace_df = pd.DataFrame(replace_data)

In [11]:
# Replace full team names with their abbreviations using replace_df.
# One dict-based replace per column does the whole substitution in a single
# pass, instead of looping over replace_df row by row and rewriting the full
# column once per team.
team_abbreviations = dict(zip(replace_df['team_name'], replace_df['team_abrev']))
for team_column in ('home_team', 'away_team'):
    scores_df[team_column] = scores_df[team_column].replace(team_abbreviations)

scores_df

Unnamed: 0,game_id,map_name,home_team,away_team,map_duration,home_score,away_score
0,0,Ascent,MIBR,LEV,1:05:24,9,13
1,1,Icebox,MIBR,LEV,45:04,7,13
2,2,Lotus,SEN,NRG,59:21,13,8
3,3,Sunset,SEN,NRG,1:00:54,14,12
4,4,Icebox,FUR,100T,1:12:51,14,12
...,...,...,...,...,...,...,...
58,58,TBD,LOUD,100T,-,0,0
59,59,TBD,LOUD,100T,-,0,0
60,60,TBD,SEN,G2,-,0,0
61,61,TBD,SEN,G2,-,0,0


In [12]:
# Write the final table to disk without the positional index column
output_csv_path = '../Data/scores_data.csv'
scores_df.to_csv(output_csv_path, index=False)

# Confirmation message for the notebook output
print('DataFrame saved to scores_data.csv')

DataFrame saved to scores_data.csv
