In [48]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
import lxml
import re
import numpy as np

This saves the HLTV player-stats HTML page for each year, restricted to LAN matches (the download cell below is currently commented out).

In [49]:
# NOTE(review): this download step is disabled. It writes one 'players<year>.html'
# file per year (2013 through 2019) — the same filename pattern allplayersonLAN
# opens — so presumably it was run once and then commented out to avoid
# re-fetching; confirm before re-enabling.
# def oneYear(year):
#     return 'startDate=' + str(year) + '-01-01&endDate=' + str(year) + '-12-31'


# for year in np.arange(2013,2020):
#     url = 'https://www.hltv.org/stats/players?' + oneYear(year) + '&matchType=Lan'
#     temp = requests.get(url).text
#     with open('players' + str(year) + '.html', 'w', encoding='utf-8') as f:
#         f.write(temp)

Filtering process

In [37]:
def allplayersonLAN(year):
    """Map every player listed in a saved yearly stats page to a profile URL.

    Reads 'players<year>.html' from the current working directory and parses
    it with BeautifulSoup using the 'lxml' parser.

    Args:
        year: Value interpolated into the filename via ``str(year)``.

    Returns:
        dict: For each ``<td class="playerCol">`` cell, the cell's text as the
        key and 'https://hltv.org' plus the ``href`` of the cell's first
        ``<a>`` as the value. If two cells share the same text, the later one
        replaces the earlier one.
    """
    filename = 'players' + str(year) + '.html'
    with open(filename, 'r', encoding='utf-8') as page:
        parsed = BeautifulSoup(page, 'lxml')
    return {
        cell.text: 'https://hltv.org' + cell.find('a')['href']
        for cell in parsed.find_all('td', class_='playerCol')
    }

In [34]:
def againstTop50(row, min_maps=20, delay=0.5):
    """Report whether a player has enough maps against top-50 opponents.

    Fetches the page at ``row.Webpage`` and scans every ``div.col-custom``
    whose text mentions 'top 50 opponents'. The map count is the first
    '(<digits>' group found in that div's ``div.rating-maps`` text.

    Designed to be used with ``DataFrame.apply(..., axis=1)``; the extra
    parameters have defaults so existing one-argument calls keep working.

    Args:
        row: Object exposing a ``Webpage`` attribute holding the URL to fetch.
        min_maps: Minimum number of maps required to return True (default 20).
        delay: Seconds to sleep after every call (default 0.5).

    Returns:
        bool: True if a matching section reports at least ``min_maps`` maps,
        otherwise False.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            answer within the timeout.
    """
    url = row.Webpage
    try:
        # A timeout keeps one stalled connection from hanging a whole apply().
        src = requests.get(url, timeout=30).text
        soup = BeautifulSoup(src, 'lxml')
        for section in soup.find_all('div', class_='col-custom'):
            if 'top 50 opponents' not in section.text:
                continue
            maps_div = section.find('div', class_='rating-maps')
            if maps_div is None:
                # Section present but without a map counter: nothing to compare.
                continue
            count = re.search(r'\((\d+)', maps_div.text)
            if count and int(count.group(1)) >= min_maps:
                return True
        return False
    finally:
        # Throttle on every exit path; previously the pause was skipped
        # whenever the function returned True.
        time.sleep(delay)


This function combines the two above to filter out players who may not have participated in enough important events.

In [1]:
def exportFilteredPlayers(year):
    """Write the players of ``year`` that pass ``againstTop50`` to a CSV file.

    Builds a frame indexed by the keys of ``allplayersonLAN(year)`` with a
    'Webpage' column, appends an 'AgainstTop50' column holding the result of
    ``againstTop50`` for each row, and saves only the rows where that column
    is true to 'players<year>.csv' in the current working directory.

    Args:
        year: Passed to ``allplayersonLAN`` and interpolated into the output
            filename via ``str(year)``.

    Returns:
        None
    """
    players = pd.DataFrame.from_dict(
        allplayersonLAN(year), orient='index', columns=['Webpage']
    )
    passed = players.apply(againstTop50, axis=1)
    players.insert(len(players.columns), 'AgainstTop50', passed)
    kept = players[players.AgainstTop50]
    kept.to_csv('players' + str(year) + '.csv')

This function obtains HLTV's own top-20 players for a given year.

In [87]:
def getTop20(year, index):
    """Scrape the player names from an HLTV 'top 20 players of <year>' article.

    The article URL is built from ``index`` and ``year``. Two page layouts are
    handled:

    * 2016 and 2017: one name per ``<tr>``, taken as the third
      whitespace-separated token of the row text with its first and last
      characters removed.
    * 2018 and 2019: the text of the second ``<blockquote>`` is split on
      '<number>.' followed by two spaces, and the name is whatever sits
      between the first pair of double quotes in each piece.

    Args:
        year: Year of the ranking; only 2016-2019 are supported.
        index: Numeric article id used in the news URL.

    Returns:
        numpy.ndarray: The names in page order. Empty for an unsupported year
        (a message is printed and no request is made).

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            answer within the timeout.
        IndexError: If the page does not have the layout described above.
    """
    table_years = (2016, 2017)
    blockquote_years = (2018, 2019)

    # Validate before fetching so an unsupported year costs no network call.
    if year not in table_years and year not in blockquote_years:
        print('Error: no top-20 parser for year ' + str(year))
        return np.array([])

    url = 'https://www.hltv.org/news/' + str(index) + '/top-20-players-of-' + str(year) + '-introduction'
    src = requests.get(url, timeout=30).text
    soup = BeautifulSoup(src, 'lxml')

    if year in table_years:
        names = [row.text.split()[2][1:-1] for row in soup.find_all('tr')]
    else:
        ranking = soup.find_all('blockquote')[1].text.strip()
        # Raw string: same pattern as before, without the invalid '\.' escape.
        entries = re.split(r"[0-9]+\.  ", ranking)
        # Pieces of length <= 1 are the empty leftovers of the split.
        names = [entry.split('"')[1] for entry in entries if len(entry) > 1]

    # Build the array once instead of concatenating inside the loop.
    return np.array(names)

def aggregateTop20(temp):
    """Collect the rankings of several years into one DataFrame.

    Calls ``getTop20(year, index)`` for every item of ``temp`` and places each
    result in its own column, so any number of years is accepted (the previous
    version only worked for exactly four years of twenty names).

    Args:
        temp: Mapping of year -> article index, iterated in insertion order.

    Returns:
        pandas.DataFrame: One column per year (labelled with the mapping's
        keys, in order) and one row per ranking position.

    Raises:
        ValueError: If the years do not all yield the same number of names.
            Flattening and reshaping, as done before, could silently shift
            players into the wrong year's column in that case.
    """
    per_year = {year: getTop20(year, index) for year, index in temp.items()}

    sizes = {year: len(names) for year, names in per_year.items()}
    if len(set(sizes.values())) > 1:
        raise ValueError('Rankings differ in length per year: ' + str(sizes))

    return pd.DataFrame(per_year)

# Year -> numeric article id; getTop20 places the id in the HLTV news URL it requests.
hltvTop20Index = {2016: 19558, 2017: 22348, 2018: 25735, 2019: 28749}
# Fetches every listed year; the resulting frame has one column per year key.
top20DF = aggregateTop20(hltvTop20Index)
# Bare expression so the notebook renders the frame as this cell's output.
top20DF

In [126]:

def checkIfTop20(row, top20):
    """Tell whether a row's label occurs inside any entry of ``top20``.

    Intended for ``DataFrame.apply(..., axis=1, args=(top20,))``.

    Args:
        row: Row whose ``name`` (its index label) is the text to look for.
        top20: pandas Series of strings to search.

    Returns:
        bool: True if at least one entry of ``top20`` contains ``row.name``
        as a literal, case-sensitive substring; False otherwise.

    NOTE(review): this is a substring test, not an equality test, so a short
    label would also match a longer name that contains it — confirm that is
    intended.
    """
    hits = top20.str.contains(row.name, regex=False)
    return bool(hits.sum() > 0)


# Load the filtered 2019 players (first CSV column becomes the index) and drop
# the 'AgainstTop50' column.
df = pd.read_csv('filteredplayers/players2019.csv', index_col=0).drop(columns='AgainstTop50')
# Append a flag column: does the row's index label appear in the 2019 column of top20DF?
df.insert(len(df.columns), 'HLTV Top 20', df.apply(checkIfTop20, axis=1, args=(top20DF[2019],)))
df


Unnamed: 0,Webpage,HLTV Top 20
ZywOo,https://hltv.org/stats/players/11893/ZywOo?sta...,True
s1mple,https://hltv.org/stats/players/7998/s1mple?sta...,True
device,https://hltv.org/stats/players/7592/device?sta...,True
xsepower,https://hltv.org/stats/players/12733/xsepower?...,False
EliGE,https://hltv.org/stats/players/8738/EliGE?star...,True
...,...,...
daps,https://hltv.org/stats/players/8521/daps?start...,False
tiziaN,https://hltv.org/stats/players/5796/tiziaN?sta...,False
HUNDEN,https://hltv.org/stats/players/7415/HUNDEN?sta...,False
advent,https://hltv.org/stats/players/8600/advent?sta...,False


possible features
- ADR
- KPR
- DPR
- APR
- IMPACT
- KAST
- RATING
- CONSISTENCY
- k-d diff
- % of maps with 1+ rating
- HS%