## Import Libraries

In [73]:
import requests
import json
import pandas as pd
import numpy as np
import time
import random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Set constant variables

In [2]:
base_url = 'https://csgostats.gg/'
leaderboards_url = 'https://csgostats.gg/leaderboards'

## Get list of ranks and the urls for each rank leaderboard.

### Using Requests (can't scrape due to captcha)

In [12]:
result = requests.get(leaderboards_url)
soup = BeautifulSoup(result.text, 'html.parser')
soup

### Using selenium (success)

In [3]:
username = 'chris'

# webdriver options
chromeOptions = Options()
# chromeOptions.add_argument('--kiosk')                          # sets the headless browser into full screen mode
# chromeOptions.add_argument('--headless')                       # opens the browser silently (hides it, if you enable this, make sure to disable kiosk)
chromeOptions.add_argument('--log-level=3')                    # stops the headless browser's logging features
# chromeOptions.add_argument('blink-settings=imagesEnabled=false') # set loading images to be false (for faster loading)
chromeOptions.add_argument('--no-sandbox')                     # required when running as root user. otherwise you would get no sandbox errors. 
chromeOptions.add_argument('--disable-extensions')
chromeOptions.add_argument('--disable-gpu')
chromeOptions.page_load_strategy = 'normal'

# to prevent getting a captcha, make it look like that the bot is a human
chromeOptions.add_argument("--profile-directory=Default")
chromeOptions.add_argument("--user-data-dir=C:/Users/%s/AppData/Local/Google/Chrome/User Data" % username)
chromeOptions.add_argument('--disable-blink-features=AutomationControlled')

chromeOptions.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")

# Run the Driver
driver = webdriver.Chrome('./chromedriver/chromedriver.exe', options=chromeOptions, service_args=['--verbose'])

  driver = webdriver.Chrome('./chromedriver/chromedriver.exe', options=chromeOptions, service_args=['--verbose'])


In [51]:
# Close the driver
driver.close()

In [95]:
# Get the list of ranks and the corresponding leaderboards in that rank
ranks = {
    'Rank': [],
    'URL': [],
}

# Go to Leaderboards page
driver.get(leaderboards_url)

# Loop the competitive rank images
for li in driver.find_elements_by_tag_name('ul')[5].find_elements_by_tag_name('li'):
    rank_name = li.find_element_by_tag_name('img').get_attribute('alt')
    rank_url = li.find_element_by_tag_name('a').get_attribute('href')
    
    print(rank_name, '|', rank_url)
    
    ranks['Rank'].append(rank_name)
    ranks['URL'].append(rank_url)

  for li in driver.find_elements_by_tag_name('ul')[5].find_elements_by_tag_name('li'):
  rank_name = li.find_element_by_tag_name('img').get_attribute('alt')
  rank_url = li.find_element_by_tag_name('a').get_attribute('href')


Silver I | https://csgostats.gg/leaderboards/1
Silver II | https://csgostats.gg/leaderboards/2
Silver III | https://csgostats.gg/leaderboards/3
Silver IV | https://csgostats.gg/leaderboards/4
Silver Elite | https://csgostats.gg/leaderboards/5
Silver Elite Master | https://csgostats.gg/leaderboards/6
Gold Nova I | https://csgostats.gg/leaderboards/7
Gold Nova II | https://csgostats.gg/leaderboards/8
Gold Nova III | https://csgostats.gg/leaderboards/9
Gold Nova Master | https://csgostats.gg/leaderboards/10
Master Guardian I | https://csgostats.gg/leaderboards/11
Master Guardian II | https://csgostats.gg/leaderboards/12
Master Guardian Elite | https://csgostats.gg/leaderboards/13
Distinguished Master Guardian | https://csgostats.gg/leaderboards/14
Legendary Eagle | https://csgostats.gg/leaderboards/15
Legendary Eagle Master | https://csgostats.gg/leaderboards/16
Supreme Master First Class | https://csgostats.gg/leaderboards/17
The Global Elite | https://csgostats.gg/leaderboards/18


In [96]:
ranks_df = pd.DataFrame(ranks)
ranks_df.head()

Unnamed: 0,Rank,URL
0,Silver I,https://csgostats.gg/leaderboards/1
1,Silver II,https://csgostats.gg/leaderboards/2
2,Silver III,https://csgostats.gg/leaderboards/3
3,Silver IV,https://csgostats.gg/leaderboards/4
4,Silver Elite,https://csgostats.gg/leaderboards/5


In [98]:
ranks_df.to_json('ranks.json', orient='records')

## Get all the players and their stats for each rank

The scrape started from 06/16/22 5PM and ended at 06/16/22 __PM.

In [100]:
ranks_df = pd.read_json('ranks.json')
ranks_df.head()

Unnamed: 0,Rank,URL
0,Silver I,https://csgostats.gg/leaderboards/1
1,Silver II,https://csgostats.gg/leaderboards/2
2,Silver III,https://csgostats.gg/leaderboards/3
3,Silver IV,https://csgostats.gg/leaderboards/4
4,Silver Elite,https://csgostats.gg/leaderboards/5


In [None]:
players_data = {
    'Competitive Rank': [],
    'Leaderboards Rank': [],
    'Username': [],
    'Primary Weapon': [],
    'Secondary Weapon': [],
    'K/D Score': [],
    'K/D Ratio': [],
    'HS %': [],
    'Win Rate': [],
    '1vX': [],
    'Rating': []
}

for _, data in list(ranks_df.iterrows())[3:]:
    page = 1
    print(data['Rank'])
    while(True):
        # Get the URL passing the page query
        driver.get(data['URL'] + ('?page=%d' % page))
        
        # Pass the html source of the result to beautiful soup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Get all the players using class global-lb and recursive=False
        players = soup.find('div', class_='global-lb').find_all('div', recursive=False)[3].find_all('div', recursive=False)[:-1]
        print('Page %d: %d players' % (page, len(players)))
        
        if(len(players) == 0):
            break
        
        for div in players:
            # Get all top layer divs 
            # Note: there are no classes or ids that could select each players
            divs = div.find_all('div', recursive=False)
            if(len(divs) > 0): # if 
                rank = divs[0].text.strip().replace('#', '')
                username = divs[1].text.strip()
                primary = divs[2].find('img').get('title')
                secondary = divs[3].find('img').get('title')
                kd_score, kd_ratio = divs[4].text.strip().split('\n')
                hs_percent = divs[5].text.strip()
                win_rate = divs[6].text.strip()
                onevx = divs[7].text.strip()
                rating = divs[8].text.strip()

                players_data['Competitive Rank'].append(data['Rank'])
                players_data['Leaderboards Rank'].append(rank)
                players_data['Username'].append(username)
                players_data['Primary Weapon'].append(primary)
                players_data['Secondary Weapon'].append(secondary)
                players_data['K/D Score'].append(kd_score)
                players_data['K/D Ratio'].append(kd_ratio)
                players_data['HS %'].append(hs_percent)
                players_data['Win Rate'].append(win_rate)
                players_data['1vX'].append(onevx)
                players_data['Rating'].append(rating)

                # print('[', rank, ']', username, '|', primary, '|', secondary, '|', kd_score, '|', kd_ratio, '|', hs_percent, '|', win_rate, '|', onevx, '|', rating)
        
        page += 1
        
        # Add random delay to prevent excessive page request
        time.sleep(random.random() * 0.75 + 0.5)

Silver IV
Page 1: 100 players
Page 2: 100 players
Page 3: 100 players
Page 4: 100 players
Page 5: 100 players
Page 6: 100 players
Page 7: 100 players
Page 8: 100 players
Page 9: 100 players
Page 10: 100 players
Page 11: 100 players
Page 12: 100 players
Page 13: 100 players
Page 14: 100 players
Page 15: 100 players
Page 16: 100 players
Page 17: 100 players
Page 18: 100 players
Page 19: 100 players
Page 20: 100 players
Page 21: 100 players
Page 22: 100 players
Page 23: 100 players
Page 24: 100 players
Page 25: 100 players
Page 26: 100 players
Page 27: 100 players
Page 28: 100 players
Page 29: 100 players
Page 30: 100 players
Page 31: 100 players
Page 32: 100 players
Page 33: 100 players
Page 34: 100 players
Page 35: 100 players
Page 36: 100 players
Page 37: 100 players
Page 38: 100 players
Page 39: 100 players
Page 40: 100 players
Page 41: 100 players
Page 42: 100 players
Page 43: 100 players
Page 44: 100 players
Page 45: 100 players
Page 46: 100 players
Page 47: 100 players
Page 48: 100

Page 86: 100 players
Page 87: 100 players
Page 88: 100 players
Page 89: 100 players
Page 90: 100 players
Page 91: 100 players
Page 92: 100 players
Page 93: 100 players
Page 94: 100 players
Page 95: 100 players
Page 96: 100 players
Page 97: 100 players
Page 98: 100 players
Page 99: 100 players
Page 100: 100 players
Page 101: 100 players
Page 102: 100 players
Page 103: 100 players
Page 104: 100 players
Page 105: 100 players
Page 106: 100 players
Page 107: 100 players
Page 108: 100 players
Page 109: 100 players
Page 110: 100 players
Page 111: 100 players
Page 112: 100 players
Page 113: 100 players
Page 114: 100 players
Page 115: 100 players
Page 116: 100 players
Page 117: 100 players
Page 118: 100 players
Page 119: 100 players
Page 120: 100 players
Page 121: 100 players
Page 122: 100 players
Page 123: 100 players
Page 124: 100 players
Page 125: 100 players
Page 126: 100 players
Page 127: 100 players
Page 128: 100 players
Page 129: 100 players
Page 130: 100 players
Page 131: 100 players
Pa

In [111]:
players_df = pd.DataFrame(players_data)
players_df.head()

Unnamed: 0,Competitive Rank,Leaderboards Rank,Username,Primary Weapon,Secondary Weapon,K/D Score,K/D Ratio,HS %,Win Rate,1vX,Rating
0,Silver III,1,BLACK KEVIN,awp,ak47,2.19,199 / 91,27%,70%,5,2.32
1,Silver III,2,Royal Timmy,m4a1_silencer,awp,2.72,307 / 113,47%,90%,9,2.25
2,Silver III,3,✞⁧⁧⁧⁧secret,m4a1_silencer,ak47,2.47,178 / 72,66%,80%,10,2.17
3,Silver III,4,Danny Delete-O,ak47,awp,2.28,296 / 130,49%,50%,6,2.09
4,Silver III,5,Masefx,m4a1_silencer,ak47,2.23,172 / 77,41%,60%,6,2.08


In [112]:
len(players_df)

14468

In [113]:
players_df.to_json('s4-sem_players.json', orient='records')

In [114]:
players_df = pd.read_json('s4-sem_players.json')
players_df.head()

Unnamed: 0,Competitive Rank,Leaderboards Rank,Username,Primary Weapon,Secondary Weapon,K/D Score,K/D Ratio,HS %,Win Rate,1vX,Rating
0,Silver III,1,BLACK KEVIN,awp,ak47,2.19,199 / 91,27%,70%,5,2.32
1,Silver III,2,Royal Timmy,m4a1_silencer,awp,2.72,307 / 113,47%,90%,9,2.25
2,Silver III,3,✞⁧⁧⁧⁧secret,m4a1_silencer,ak47,2.47,178 / 72,66%,80%,10,2.17
3,Silver III,4,Danny Delete-O,ak47,awp,2.28,296 / 130,49%,50%,6,2.09
4,Silver III,5,Masefx,m4a1_silencer,ak47,2.23,172 / 77,41%,60%,6,2.08
