In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
import lxml
import re
import numpy as np

The cell below saves, for each year, the HLTV player-stats HTML page restricted to LAN matches.

In [49]:
# def oneYear(year):
#     return 'startDate=' + str(year) + '-01-01&endDate=' + str(year) + '-12-31'


# for year in np.arange(2013,2020):
#     url = 'https://www.hltv.org/stats/players?' + oneYear(year) + '&matchType=Lan'
#     temp = requests.get(url).text
#     with open('players' + str(year) + '.html', 'w', encoding='utf-8') as f:
#         f.write(temp)

Filtering process

In [37]:
def allplayersonLAN(year):
    """Map every player listed in the saved ``players<year>.html`` page to a URL.

    Parameters
    ----------
    year : value convertible with ``str``
        Used to build the file name ``'players' + str(year) + '.html'``.

    Returns
    -------
    dict
        Keys are the text of each ``<td class="playerCol">`` cell; values are
        ``'https://hltv.org'`` followed by the ``href`` of the first ``<a>``
        inside that cell.
    """
    html_path = 'players' + str(year) + '.html'
    with open(html_path, 'r', encoding='utf-8') as page_file:
        page = BeautifulSoup(page_file, 'lxml')
    return {
        cell.text: 'https://hltv.org' + cell.find('a')['href']
        for cell in page.find_all('td', class_='playerCol')
    }

In [34]:
def againstTop50(row):
    """Tell whether a player has at least 20 maps against top-50 opponents.

    Downloads the page at ``row.Webpage``, looks for a ``div.col-custom`` whose
    text mentions ``'top 50 opponents'`` and reads the map count from its
    ``div.rating-maps`` child.

    Parameters
    ----------
    row : pandas.Series
        One row of the player table; only ``row.Webpage`` is read.

    Returns
    -------
    bool
        True if a matching block reports 20 or more maps, otherwise False.

    Notes
    -----
    The 0.5 s pause runs after every call (including early ``True`` returns
    and errors), so consecutive requests are always spaced out.
    """
    try:
        # A timeout keeps a stalled connection from blocking the whole run.
        page = BeautifulSoup(requests.get(row.Webpage, timeout=30).text, 'lxml')
        for box in page.find_all('div', class_='col-custom'):
            if 'top 50 opponents' in box.text:
                # The first whitespace token is taken to be "(<count>"; keep
                # what follows the opening parenthesis.
                maps_played = box.find('div', class_='rating-maps').text.split()[0].split('(')[1]
                if int(maps_played) >= 20:
                    return True
        return False
    finally:
        time.sleep(0.5)


This function combines the two above to filter out players who may not have participated in enough important events.

In [1]:
def exportFilteredPlayers(year):
    """Write ``players<year>.csv`` with the players that pass ``againstTop50``.

    Builds a one-column table (``Webpage``) from ``allplayersonLAN(year)``,
    appends an ``AgainstTop50`` column computed row by row, and saves only the
    rows where that column is true.

    Parameters
    ----------
    year : value convertible with ``str``
        Forwarded to ``allplayersonLAN`` and used in the output file name.

    Returns
    -------
    None
    """
    players = pd.DataFrame.from_dict(
        allplayersonLAN(year), orient='index', columns=['Webpage']
    )
    column_position = len(players.columns)
    passes_filter = players.apply(againstTop50, axis=1)
    players.insert(column_position, 'AgainstTop50', passes_filter)

    kept = players[players['AgainstTop50']]
    kept.to_csv('players{}.csv'.format(year))
    return None

This function obtains HLTV's own top-20 player ranking for a given year.

In [2]:
def getTop20(year, index):
    """Scrape player names from HLTV's "top 20 players of <year>" introduction article.

    Parameters
    ----------
    year : int
        Ranking year. Only 2016-2019 are handled; two different page layouts
        are parsed depending on the year.
    index : int
        Numeric id placed in the article URL after ``/news/``.

    Returns
    -------
    numpy.ndarray
        1-D array of names in page order. For an unsupported year ``'Error'``
        is printed and an empty array is returned.
    """
    url = 'https://www.hltv.org/news/' + str(index) + '/top-20-players-of-' + str(year) + '-introduction'
    # A timeout keeps a stalled connection from blocking the notebook.
    soup = BeautifulSoup(requests.get(url, timeout=30).text, 'lxml')
    names = []

    if year in [2016, 2017]:
        # Table layout: take the third whitespace-separated token of every
        # <tr> and strip its first and last character.
        names = [tr.text.split()[2][1:-1] for tr in soup.find_all('tr')]
    elif year in [2018, 2019]:
        # List layout: the second <blockquote> holds "<n>.  ..." entries with
        # the name wrapped in double quotes.
        ranking = soup.find_all('blockquote')[1].text.strip()
        # Raw string: "\." in a normal string literal is an invalid escape.
        for entry in re.split(r"[0-9]+\.  ", ranking):
            if len(entry) > 1:
                names.append(entry.split('"')[1])
    else:
        print('Error')

    # Single concatenation onto an empty array (instead of one per name)
    # yields the same array as growing it element by element.
    return np.concatenate((np.array([]), names))

def aggregateTop20(temp):
    """Collect the yearly HLTV rankings into one table, one column per year.

    Parameters
    ----------
    temp : dict
        Maps each year to the article id that ``getTop20`` needs for it.

    Returns
    -------
    pandas.DataFrame
        Columns are the keys of ``temp`` in iteration order; row ``i`` holds
        the ``i``-th name returned for each year. An empty ``temp`` gives an
        empty frame.

    Raises
    ------
    ValueError
        If the years do not all yield the same number of names.
    """
    # Building the frame column by column works for any number of years and
    # fails loudly on unequal lengths, instead of relying on a fixed 4 x 20
    # reshape of one flat array.
    per_year = {year: getTop20(year, article_id) for year, article_id in temp.items()}
    return pd.DataFrame(per_year)

# Year -> numeric id that getTop20 inserts after "/news/" in the article URL.
hltvTop20Index = {2016: 19558, 2017: 22348, 2018: 25735, 2019: 28749}
# Scrape every listed year and combine the names into one table (one column per year).
top20DF = aggregateTop20(hltvTop20Index)
# Bare expression so the notebook renders the table.
top20DF

Unnamed: 0,2016,2017,2018,2019
0,coldzera,coldzera,s1mple,ZywOo
1,FalleN,NiKo,device,s1mple
2,device,fer,NiKo,device
3,s1mple,rain,electronic,EliGE
4,Snax,device,dupreeh,Magisk
5,shox,FalleN,NAF,electronic
6,f0rest,kennyS,Magisk,NAF
7,olofmeister,s1mple,gla1ve,Brehze
8,ScreaM,GuardiaN,KRIMZ,Twistzz
9,flusha,dupreeh,coldzera,ropz


In [3]:
def checkIfTop20(row, top20):
    """Tell whether the player labelling ``row`` appears in a top-20 list.

    Parameters
    ----------
    row : pandas.Series
        A table row whose ``name`` (its index label) is the player name.
    top20 : pandas.Series
        Player names to search.

    Returns
    -------
    bool
        True only if some entry of ``top20`` equals the player name exactly
        (case-sensitive).
    """
    # Exact equality, not substring containment: a short nickname such as
    # "ra" must not match because "rain" happens to contain it.
    return bool(top20.eq(row.name).any())

# Load the saved 2019 player table (first CSV column becomes the index) and
# drop its 'AgainstTop50' column.
df = pd.read_csv('filteredplayers/players2019.csv',index_col=0).drop('AgainstTop50',axis=1)
# Append a boolean 'HLTV Top 20' column as the last column, computed per row
# against the 2019 column of top20DF.
df.insert(df.shape[1],'HLTV Top 20',df.apply(checkIfTop20,axis=1,args=(top20DF.loc[:,2019],)))
# Bare expression so the notebook renders the table.
df

Unnamed: 0,Webpage,HLTV Top 20
ZywOo,https://hltv.org/stats/players/11893/ZywOo?sta...,True
s1mple,https://hltv.org/stats/players/7998/s1mple?sta...,True
device,https://hltv.org/stats/players/7592/device?sta...,True
xsepower,https://hltv.org/stats/players/12733/xsepower?...,False
EliGE,https://hltv.org/stats/players/8738/EliGE?star...,True
...,...,...
daps,https://hltv.org/stats/players/8521/daps?start...,False
tiziaN,https://hltv.org/stats/players/5796/tiziaN?sta...,False
HUNDEN,https://hltv.org/stats/players/7415/HUNDEN?sta...,False
advent,https://hltv.org/stats/players/8600/advent?sta...,False


In [46]:
def getFeatures(row):
    """Placeholder with no implementation yet: ignores ``row`` and returns None."""
    return None

# Sample HLTV stats URLs for trying out the scraping helpers.
# Alternative value for `url` (ZywOo, 2019), currently disabled:
#url = 'https://www.hltv.org/stats/players/11893/ZywOo?startDate=2019-01-01&endDate=2019-12-31&matchType=Lan'
# Player page under /stats/players/ (XANTARES, 2016 date range, LAN).
url = 'https://www.hltv.org/stats/players/7938/XANTARES?startDate=2016-01-01&endDate=2016-12-31&matchType=Lan'
# Page under /stats/players/individual/ (ZywOo, 2019 date range, LAN).
url2 = 'https://www.hltv.org/stats/players/individual/11893/ZywOo?startDate=2019-01-01&endDate=2019-12-31&matchType=Lan'
# Page under /stats/players/matches/ (ZywOo, 2019 date range, LAN).
url3 = 'https://www.hltv.org/stats/players/matches/11893/ZywOo?startDate=2019-01-01&endDate=2019-12-31&matchType=Lan'



In [75]:
def getScaledRating(sp):
    """Combine the "vs top N opponents" ratings of a parsed page into one score.

    Every ``div.rating-breakdown`` inside ``div.featured-ratings-container``
    contributes ``rating * weight``; the weight depends on the block's
    description text. Blocks reporting zero maps are skipped, and blocks with
    an unrecognised description contribute 0. For the top-5, top-10 and top-20
    blocks the contribution is multiplied by 0.75 when the map count is below
    10, 20 and 40 respectively.

    Parameters
    ----------
    sp : parsed page object
        Must provide ``find`` / ``find_all`` in the BeautifulSoup style.

    Returns
    -------
    int or float
        Sum of all contributions (the integer 0 if nothing contributed).
    """
    weights = {
        'vs top 5 opponents': 0.400,
        'vs top 10 opponents': 0.300,
        'vs top 20 opponents': 0.200,
        'vs top 30 opponents': 0.075,
        'vs top 50 opponents': 0.025,
    }
    # Map counts below these thresholds trigger the 0.75 small-sample factor.
    min_maps = {
        'vs top 5 opponents': 10,
        'vs top 10 opponents': 20,
        'vs top 20 opponents': 40,
    }

    container = sp.find('div', class_='featured-ratings-container')
    total = 0
    for entry in container.find_all('div', class_='rating-breakdown'):
        label = entry.find('div', class_='rating-description').text
        maps_text = entry.find('div', class_='rating-maps').text
        # Drop the first and last character, then read the leading number.
        n_maps = int(maps_text[1:-1].split()[0])
        if n_maps == 0:
            continue

        rating = float(entry.find('div', class_='rating-value').text)
        contribution = rating * weights[label] if label in weights else 0
        if label in min_maps and n_maps < min_maps[label]:
            contribution *= 0.75
        total += contribution

    return total


def getFromOverview(url, arr):
    """Fill slots 0-7 of ``arr`` with stats scraped from the page at ``url``.

    Parameters
    ----------
    url : str
        Page to download.
    arr : indexable, mutable sequence
        Written in place at the positions listed in ``statsIndex``.

    Returns
    -------
    The same ``arr`` object, after modification.
    """
    page = BeautifulSoup(requests.get(url).text, 'lxml')
    statsIndex = {'Rating': 0, 'ADR': 1, 'KPR': 2, 'DPR': 3,
                  'Assists / round': 4, 'Impact': 5, 'KAST': 6, 'Grenade dmg / Round': 7}

    # Summary boxes: the first word of the sub-header selects the slot.
    for box in page.find_all('div', class_=re.compile('summaryStatBreakdown ')):
        label = box.find('div', class_='summaryStatBreakdownSubHeader').text.split()[0]
        if label not in statsIndex:
            continue
        shown = box.find('div', class_='summaryStatBreakdownDataValue').text
        # The KAST value loses its last character before being stored.
        arr[statsIndex[label]] = shown[:-1] if label == 'KAST' else shown

    # Stat rows: the value is read from the second <span> of the matching row.
    for stat_row in page.find_all('div', class_='stats-row'):
        for label in ('Grenade dmg / Round', 'Assists / round'):
            if label in stat_row.text:
                arr[statsIndex[label]] = stat_row.find_all('span')[1].text
                break

    # Slot 0 always ends up holding the weighted rating, overriding anything
    # the summary loop may have written there.
    arr[statsIndex['Rating']] = getScaledRating(page)
    return arr


# 'Maps with 1+ rating'

# 'Kill - Death difference', 
#                                   'Team win percent after first kill', 
#                                   'Opening kill ratio',
# 15-slot feature vector, zero-initialised; the scraping helpers write into it by index.
statsArr = np.zeros((15,))
# Fill the overview-page slots in place; the returned array is displayed as the cell output.
getFromOverview(url,statsArr)

array([  0.3555, 104.3   ,   0.95  ,   0.67  ,   0.2   ,   1.7   ,
        76.7   ,   2.1   ,   0.    ,   0.    ,   0.    ,   0.    ,
         0.    ,   0.    ,   0.    ])

In [76]:
def getFromMatches(url, arr):
    """Placeholder: download and parse the page at ``url`` but extract nothing yet.

    Parameters
    ----------
    url : str
        Page to download.
    arr : any
        Returned untouched.

    Returns
    -------
    The ``arr`` argument, unchanged.
    """
    page_html = requests.get(url).text
    _parsed = BeautifulSoup(page_html, 'lxml')  # parsed but not used yet
    return arr


def getFromIndividual(url, arr):
    """Fill slots 8-10 of ``arr`` with stats scraped from the page at ``url``.

    Each ``div.stats-row`` is identified by the text of its first ``<span>``;
    rows whose label is not in ``statsIndex`` are ignored.

    Parameters
    ----------
    url : str
        Page to download.
    arr : indexable, mutable sequence
        Written in place at the positions listed in ``statsIndex``.

    Returns
    -------
    The same ``arr`` object, after modification.
    """
    page = BeautifulSoup(requests.get(url).text, 'lxml')
    statsIndex = {'Kill - Death difference': 8, 'Opening kill ratio': 9, 'Team win percent after first kill': 10}

    for stat_row in page.find_all('div', class_='stats-row'):
        label_span = stat_row.span
        label = label_span.text
        if label not in statsIndex:
            continue

        if label == 'Kill - Death difference':
            # For this row the value sits two siblings after the label span.
            shown = label_span.next_sibling.next_sibling.text
        elif label == 'Team win percent after first kill':
            # The last character of the displayed value is dropped.
            shown = label_span.next_sibling.text[:-1]
        else:
            shown = label_span.next_sibling.text
        arr[statsIndex[label]] = shown

    return arr

# Fill slots 8-10 of statsArr in place from the url2 page; the returned array is displayed as the cell output.
getFromIndividual(url2,statsArr)

array([3.555e-01, 1.043e+02, 9.500e-01, 6.700e-01, 2.000e-01, 1.700e+00,
       7.670e+01, 2.100e+00, 1.097e+03, 1.590e+00, 7.460e+01, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00])

possible features
0. Rating
1. ADR 
2. KPR 
3. DPR 
4. APR
5. IMPACT (done)
6. KAST (done)
7. Grenade dmg
8. k-d diff
9. Opening kill ratio
10. team win percent after 1st kill
11. % of maps with 1+ rating
12. HS%