In [3]:
import lxml.html #Scraping
import requests #Get HTTP requests

Let's scrape Steam, and open the html page using the request library

Now let’s open up the web page using requests and pass that response to lxml.html.fromstring.

In [4]:
html = requests.get('https://store.steampowered.com/explore/new/')

doc = lxml.html.fromstring(html.content) #Provides an object of htmlElement type

In [5]:
new_releases = doc.xpath('//div[@id="tab_newreleases_content"]')[0]

#returns all the divs in the HTML page which have an id of tab_newrelases_content
#//div[@id="tab_newreleases_cotent"]
#Take first element since only one div on the page has the id.

// these double forward slashes tell lxml that we want to search for all tags in the HTML document which match our requirements/filters. Another option was to use / (a single forward slash). The single forward slash returns only the immediate child tags/nodes which match our requirements/filters

div tells lxml that we are searching for divs in the HTML page


[@id="tab_newreleases_content"] tells lxml that we are only interested in those divs which have an id of tab_newreleases_content

**Extract the titles and prices**

In [12]:
titles = new_releases.xpath('.//div[@class="tab_item_name"]/text()')
titles

['Onimusha: Warlords / 鬼武者',
 'Trickster VR: Co-op Dungeon Crawler',
 'My Time At Portia',
 'RESIDENT EVIL 2 / BIOHAZARD RE:2 "1-Shot Demo"',
 'Love Language Japanese',
 'What Never Was',
 'Tales of Vesperia: Definitive Edition',
 'Catherine Classic',
 'Medieval Kingdom Wars',
 'FrostRunner',
 'ATOM RPG: Post-apocalyptic indie game',
 'Garden Paws',
 'GRIS',
 'Insurgency: Sandstorm',
 "Tom Clancy's Rainbow Six® Siege - Year 4 Pass",
 'Katamari Damacy REROLL',
 'Kenshi',
 'Mutant Year Zero: Road to Eden',
 'Parkitect',
 'Equilinox',
 'Farming Simulator 19',
 'Wallpaper Engine',
 'Warhammer 40,000: Mechanicus',
 'HITMAN™ 2',
 'Thronebreaker: The Witcher Tales',
 'Thief Simulator',
 'Supply Chain Idle',
 'One Hour One Life',
 'Football Manager 2019',
 'Return of the Obra Dinn',
 'RimWorld',
 'The Jackbox Party Pack 5',
 "Assassin's Creed® Odyssey",
 'Creed: Rise to Glory™',
 'CrossCode',
 'Pummel Party']

. tells lxml that we are only interested in the tags which are the children of the new_releases tag

[@class="tab_item_name"] is pretty similar to how we were filtering divs based on id. The only difference is that here we are filtering based on the class name

/text() tells lxml that we want the text contained within the tag we just extracted. In this case, it returns the title contained in the div with the tab_item_name class nam

In [15]:
prices = new_releases.xpath('.//div[@class="discount_final_price"]/text()')
prices

['$19.99',
 '$11.04',
 '$29.99',
 'Free',
 '$15.99',
 'Free',
 '$49.99',
 '$19.99',
 '$15.99',
 'Free To Play',
 '$14.99',
 '$19.99',
 '$16.99',
 '$29.99',
 '$29.99',
 '$29.99',
 '$29.99',
 '$34.99',
 '$29.99',
 '$9.99',
 '$34.99',
 '$3.99',
 '$29.99',
 '$59.99',
 '$29.99',
 '$14.99',
 'Free To Play',
 '$19.99',
 '$49.99',
 '$19.99',
 '$34.99',
 '$29.99',
 '$59.99',
 '$29.99',
 '$19.99',
 '$9.89']

In [25]:
tags = new_releases.xpath('.//div[@class="tab_item_top_tags"]')
total_tags = []
for tag in tags:
    total_tags.append(tag.text_content()) #text_content returns the text in HTML tag

In [29]:
tags = [tag.split(",") for tag in total_tags]
tags

[['Action', ' Violent', ' Hack and Slash', ' Classic'],
 ['Adventure', ' Co-op', ' Action', ' VR'],
 ['Open World', ' Crafting', ' Simulation', ' RPG'],
 ['Gore', ' Action', ' Violent', ' Zombies'],
 ['Nudity', ' Sexual Content', ' Casual', ' Indie'],
 ['Free to Play', ' Adventure', ' Indie', ' Puzzle'],
 ['RPG', ' Anime', ' JRPG', ' Action RPG'],
 ['Sexual Content', ' Anime', ' Puzzle', ' Nudity'],
 ['Strategy', ' Medieval', ' RTS', ' Simulation'],
 ['Free to Play', ' Indie', ' Action', ' Parkour'],
 ['RPG', ' Post-apocalyptic', ' Turn-Based Combat', ' Isometric'],
 ['Adventure', ' Simulation', ' Indie', ' RPG'],
 ['Atmospheric', ' Indie', ' Great Soundtrack', ' Adventure'],
 ['FPS', ' Realistic', ' Shooter', ' Multiplayer'],
 ['Action', ' FPS', ' Multiplayer', ' Tactical'],
 ['Casual', ' Action', ' Great Soundtrack', ' Puzzle'],
 ['Open World', ' Sandbox', ' RPG', ' Survival'],
 ['Strategy', ' RPG', ' Adventure', ' Turn-Based Combat'],
 ['Management', ' Simulation', ' Building', ' Sa

**Extract the Platform: How to scrape spans**

In [39]:
platforms_div = new_releases.xpath('.//div[@class="tab_item_details"]')
total_platforms = []

for game in platforms_div:
    temp = game.xpath('.//span[contains(@class, "platform_img")]')
    platforms = [t.get('class').split(' ')[-1] for t in temp]
    if 'hmd_separator' in platforms:
        platforms.remove('hmd_separator')
    total_platforms.append(platforms)

In [40]:
output = []
for info in zip(titles,prices, tags, total_platforms):
    resp = {}
    resp['title'] = info[0]
    resp['price'] = info[1]
    resp['tags'] = info[2]
    resp['platforms'] = info[3]
    output.append(resp)

In [42]:
output

[{'title': 'Onimusha: Warlords / 鬼武者',
  'price': '$19.99',
  'tags': ['Action', ' Violent', ' Hack and Slash', ' Classic'],
  'platforms': ['win']},
 {'title': 'Trickster VR: Co-op Dungeon Crawler',
  'price': '$11.04',
  'tags': ['Adventure', ' Co-op', ' Action', ' VR'],
  'platforms': ['win', 'htcvive', 'oculusrift', 'windowsmr']},
 {'title': 'My Time At Portia',
  'price': '$29.99',
  'tags': ['Open World', ' Crafting', ' Simulation', ' RPG'],
  'platforms': ['win']},
 {'title': 'RESIDENT EVIL 2 / BIOHAZARD RE:2 "1-Shot Demo"',
  'price': 'Free',
  'tags': ['Gore', ' Action', ' Violent', ' Zombies'],
  'platforms': ['win']},
 {'title': 'Love Language Japanese',
  'price': '$15.99',
  'tags': ['Nudity', ' Sexual Content', ' Casual', ' Indie'],
  'platforms': ['win']},
 {'title': 'What Never Was',
  'price': 'Free',
  'tags': ['Free to Play', ' Adventure', ' Indie', ' Puzzle'],
  'platforms': ['win']},
 {'title': 'Tales of Vesperia: Definitive Edition',
  'price': '$49.99',
  'tags':