In [146]:
# Import all packages

import bs4    # BeautifulSoup is used to scrape websites by parsing the HTML
import requests # Requests is used to make HTTP requests
import pandas as pd # Pandas is used to create "dataframes" which are used for data analysis
import copy

In [147]:
# Request the page so that its entire HTML is available for parsing

URL = "https://www.hltv.org/stats/players"  # URL of the page to be scraped
# Browser-like User-Agent, sent to prevent the website from blocking the scraper
HEADER = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
# Seconds to wait for the server. Without an explicit timeout, requests.get can
# block forever on a stalled connection and hang the whole notebook.
REQUEST_TIMEOUT = 30
res = requests.get(URL, headers=HEADER, timeout=REQUEST_TIMEOUT)
res.status_code  # Shown as the cell output (200 if okay)

200

In [148]:
# Parse the HTML of the page into a BeautifulSoup tree

text = res.content  # Raw body of the HTTP response
# Name the parser explicitly. When it is omitted, BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the same notebook can
# build a different tree on another machine. "html.parser" ships with Python.
soup = bs4.BeautifulSoup(text, "html.parser")
soup.title.text, soup.h1.text  # Shown as the cell output: the page title and the first <h1>

('CS:GO Player statistics database | HLTV.org', 'RECENT ACTIVITY')

In [149]:
# Find the player-ratings table and check how many tables matched

# Keep the search results under their own name so `player_table` only ever
# refers to a single <table> element.
player_tables = soup.find_all('table', attrs={"class": "stats-table player-ratings-table"})
if not player_tables:
    # Fail with an explanation instead of a bare "list index out of range"
    raise IndexError(
        "No table with class 'stats-table player-ratings-table' was found; "
        "the page layout may have changed or the request may have been blocked"
    )
player_table = player_tables[0]  # The first matching table holds the player data
len(player_tables)  # Shown as the cell output: number of tables that were found

In [150]:
# Dividing the table into its two sections

# `header` receives the <thead> element, `details` receives the <tbody> element
header, details = (player_table.find(section) for section in ("thead", "tbody"))

In [151]:
# Display the <thead> element found above to inspect the structure of the
# table header and count its columns
header

<thead>
<tr class="stats-table-row">
<th class="playerCol">Player</th>
<th class="teamCol">Teams</th>
<th class="mapsCol">Maps</th>
<th class="rounds-col gtSmartphone-only">Rounds</th>
<th class="kdDiffCol">K-D Diff</th>
<th class="kdCol">K/D</th>
<th class="ratingCol">Rating<span class="ratingDesc">1.0</span></th>
</tr>
</thead>

In [152]:
# Collect all table rows of the table body; the cells of each row are read later

# The rows are looked up from `player_table` rather than from `details` itself.
# Rebinding `details = details.find_all("tr")` only works once: on a second run
# `details` is already the list of rows, which has no `find_all`, so the cell
# would fail. This form gives the same rows on every run.
details = player_table.find("tbody").find_all("tr")  # This is the list of table rows

In [153]:
# Give the header cells of the table meaningful names

# Labels to write into the header cells, listed in cell order (index 0 first)
column_labels = ['Player', 'Teams', 'Maps', 'Rounds', 'K-D Diff', 'K/D', 'Rating (1.0)']

# All <th> cells inside the <thead> of the table
headers = player_table.find('thead').find_all('th')
# Replace the content of each header cell with the label at the same position
for position, label in enumerate(column_labels):
    headers[position].string = label

headers

[<th class="playerCol">Player</th>,
 <th class="teamCol">Teams</th>,
 <th class="mapsCol">Maps</th>,
 <th class="rounds-col gtSmartphone-only">Rounds</th>,
 <th class="kdDiffCol">K-D Diff</th>,
 <th class="kdCol">K/D</th>,
 <th class="ratingCol">Rating (1.0)</th>]

In [154]:
def _player_record(row):
    """Build the dictionary of scraped values for one table row.

    `row` is one <tr> element of the player table. Every value is taken as
    the text of a <td> cell selected by its CSS class, except 'Teams', which
    is the list of `title` attributes of the <img> inside each <a> of the
    team cell. No value is converted; all texts are kept as strings.
    """
    def cells(css_class):
        # All <td> cells of this row that carry the given class
        return row.find_all('td', attrs={"class": css_class})

    # name
    player_name = cells("playerCol")[0].find("a").text
    # teams: one title per link in the team cell
    team_names = [link.find("img").get("title") for link in cells("teamCol")[0].find_all("a")]
    # maps played: first "statsDetail" cell
    maps_played = cells("statsDetail")[0].text
    # rounds played
    rounds_played = cells("gtSmartphone-only")[0].text
    # kd difference
    kd_diff = cells("kdDiffCol")[0].text
    # k/d: third "statsDetail" cell
    kd_ratio = cells("statsDetail")[2].text
    # HLTV Rating
    rating = cells("ratingCol")[0].text

    return {
        'Name': player_name,
        'Teams': team_names,
        'Maps': maps_played,
        'Rounds': rounds_played,
        'K-D Diff': kd_diff,
        'K/D': kd_ratio,
        'Rating (1.0)': rating,
    }


# One dictionary per table row, in row order.
# NOTE(review): the name `list` shadows the Python builtin. It is kept because the
# cell that builds the dataframe reads this variable; rename both together.
list = [_player_record(tr) for tr in details]
    # print (list)

In [164]:
# Build the dataframe from the list of dictionaries (one dictionary per row)
df = pd.DataFrame(data=list)

In [156]:
# Turn the HLTV Rating column from text into floating-point numbers
rating_column = 'Rating (1.0)'
df[rating_column] = df[rating_column].astype(float)

In [157]:
def prettier(list):
    """Return `list` without its first and last element.

    The argument is sliced from index 1 up to (not including) the last
    index, so any sliceable value is accepted: a list gives a list, a
    string gives a string. Built-in sequences with fewer than three items
    give an empty result.
    """
    inner = slice(1, -1)  # Same as the subscript [1:-1]
    return list[inner]

In [162]:
# Format the Teams column as a plain list of team names per player.
# Each list is copied element by element instead of being joined with ','
# and split again. That round trip broke any team name that itself contains
# a comma into several entries, and turned an empty list into [''].
# A comprehension is used (not the builtin `list`) because the name `list`
# is rebound to the scraped records earlier in this notebook.
df['Teams'] = df['Teams'].apply(lambda team_names: [team for team in team_names])

In [163]:
# Save the dataframe as a CSV file without the index column
# The file is written to the current directory ("./")
output_path = "./Scrape_Data.csv"
df.to_csv(output_path, index=False)