In [1]:
# import dependencies
from bs4 import BeautifulSoup as bs
import requests
import re
import timeit
import pandas as pd

In [2]:
# Function to read the links from the file (one link per line)
def read_links(file_path):
    """Read links from a text file that stores one link per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty lines of the file, each with leading and
        trailing whitespace removed, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # links; keeping them would hand an empty URL to the scraper.
        return [link for link in stripped_lines if link]

# Relative path of the text file that holds the links to process
file_path = '../Data/over_list.txt'
# Access the links from the file; over_list is the list returned by read_links
over_list = read_links(file_path)
# Report which file the links were read from
print(f"Links read from file {file_path}")

Links read from file ../Data/over_list.txt


In [3]:
def _parse_team(div_team):
    """Return ``(name, score)`` for one team div, or ``None`` if it is unusable."""
    name_div = div_team.find('div', class_='team-name')
    score_div = div_team.find('div', class_='score')
    if name_div is None or score_div is None:
        return None

    # Remove the newlines and tabs embedded inside the team name
    team_name = re.sub(r'[\n\t]+', '', name_div.text.strip())
    try:
        team_score = int(score_div.text.strip())
    except ValueError:
        # Score text is not a plain integer
        return None
    return team_name, team_score


def extract_scores(url_list, timeout=30):
    """Scrape the two-team score of every game found on each page in ``url_list``.

    For each URL the page is downloaded and every ``div.vm-stats-game`` that
    contains a ``div.vm-stats-game-header`` is inspected. A game is recorded
    only when its header holds exactly two ``div.team`` entries, each with a
    readable ``team-name`` and an integer ``score``.

    Args:
        url_list: Iterable of page URLs to download.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``team_left``, ``score_left``,
        ``team_right`` and ``score_right``, one per recorded game, in the
        order the games were encountered.

    Pages that cannot be downloaded (network error or non-200 status) and
    games with missing or non-numeric team data are reported with ``print``
    and skipped, so one bad page does not abort the whole run.
    """
    all_scores = []  # One dict per recorded game, across all URLs

    for url in url_list:
        try:
            # Without a timeout a stalled server would block the run forever
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print('Failed to retrieve the webpage. Error:', exc)
            continue

        if response.status_code != 200:
            print('Failed to retrieve the webpage. Status code:', response.status_code)
            continue

        soup = bs(response.content, 'html.parser')

        # Find all game divs
        for game_div in soup.find_all('div', class_='vm-stats-game'):
            game_header = game_div.find('div', class_='vm-stats-game-header')
            if not game_header:
                continue

            teams = [_parse_team(div_team)
                     for div_team in game_header.find_all('div', class_='team')]

            # Only headers with exactly two teams are recorded
            if len(teams) != 2:
                continue
            if teams[0] is None or teams[1] is None:
                print('Skipping game with missing or non-numeric team data:', url)
                continue

            (left_name, left_score), (right_name, right_score) = teams
            all_scores.append({
                'team_left': left_name,
                'score_left': left_score,
                'team_right': right_name,
                'score_right': right_score,
            })

    return all_scores

In [4]:
# Extract scores for every link in over_list
scores = extract_scores(over_list)

# Convert the list of dictionaries to a DataFrame.
# The columns are named explicitly so the frame keeps its four-column schema
# even when no scores were extracted; a column-less empty frame would make the
# later column-reordering cell fail with a KeyError.
scores_df = pd.DataFrame(
    scores,
    columns=['team_left', 'score_left', 'team_right', 'score_right'],
)
scores_df

Unnamed: 0,team_left,score_left,team_right,score_right
0,MIBR,9,Leviatán,13
1,MIBR,7,Leviatán,13
2,Sentinels,13,NRG Esports,8
3,Sentinels,14,NRG Esports,12
4,FURIA,14,100 Thieves,12
...,...,...,...,...
58,LOUD,0,100 Thieves,0
59,LOUD,0,100 Thieves,0
60,Sentinels,0,G2 Esports,0
61,Sentinels,0,G2 Esports,0


In [5]:
# Number the rows 0..n-1 and keep that sequence as the game_id column
scores_df = scores_df.assign(game_id=range(len(scores_df)))

In [6]:
# Display the frame again to check the newly added game_id column
scores_df

Unnamed: 0,team_left,score_left,team_right,score_right,game_id
0,MIBR,9,Leviatán,13,0
1,MIBR,7,Leviatán,13,1
2,Sentinels,13,NRG Esports,8,2
3,Sentinels,14,NRG Esports,12,3
4,FURIA,14,100 Thieves,12,4
...,...,...,...,...,...
58,LOUD,0,100 Thieves,0,58
59,LOUD,0,100 Thieves,0,59
60,Sentinels,0,G2 Esports,0,60
61,Sentinels,0,G2 Esports,0,61


In [7]:
# Mapping from the scraped left/right column labels to home/away labels
home_away_labels = {
    'team_left': 'home_team',
    'score_left': 'home_score',
    'team_right': 'away_team',
    'score_right': 'away_score',
    'game_id': 'game_id',
}
scores_df = scores_df.rename(columns=home_away_labels)

In [8]:
# Identifier first, then both team names, then both scores
new_column_order = ['game_id', 'home_team', 'away_team', 'home_score', 'away_score']
scores_df = scores_df.loc[:, new_column_order]

In [9]:
# Save the DataFrame to a CSV file.
# index=False leaves the row index out of the file; game_id is already a column.
scores_df.to_csv('../Data/scores_data.csv', index=False)

# Confirmation message once the file has been written
print('DataFrame saved to scores_data.csv')

DataFrame saved to scores_data.csv
