In [3]:
import requests
from lxml import html

In [15]:
URL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Europe"
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of parsing an error page further down.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [16]:
# Parse the downloaded response body into an lxml HTML element tree.
tree = html.fromstring(page.content)

`xpath()` always returns a list of results; here there is exactly one match, so we take the first element:

In [95]:
# Absolute, position-based path to the countries table. It depends on the
# exact page layout, so verify that it matched before indexing the result.
TABLE_XPATH = '/html/body/div[3]/div[3]/div[4]/div/table[2]'
xpath_result = tree.xpath(TABLE_XPATH)
if not xpath_result:
    # Same exception type as the bare [0] would raise, but with a message
    # that points at the actual cause.
    raise IndexError(
        'no element matches XPath {!r}; the page layout may have changed'.format(TABLE_XPATH)
    )
table = xpath_result[0]

In [96]:
# Walk the table's direct children and show each one on its own line.
for elem in table.iterchildren():
    print(elem)

<Element tbody at 0x7f074bbf24f8>


Why is there only one element? Iterating over an element yields only its direct
children, and the table's only direct child is a ``<tbody>``. To see the whole structure,
I'll write a helper function that prints an outline of the HTML (sub)tree that I'm
currently processing.

In [77]:
def print_outline(tree, indent=0):
    """Print an indented outline of the tags in an lxml.html (sub)tree.

    Each element is printed on its own line as ``<tag>``, indented by two
    spaces per nesting level, followed recursively by its children.

    Args:
        tree: element to outline; must expose ``.tag`` and ``.iterchildren()``.
        indent: nesting depth of ``tree``; callers normally leave this at 0.

    Returns:
        None. The outline is written to standard output.
    """
    tag = tree.tag
    if not isinstance(tag, str):
        # Comments and processing instructions carry a factory function in
        # ``.tag`` instead of a string, which would break the concatenation
        # below; label such nodes by the factory's name, e.g. ``<#Comment>``.
        tag = '#' + getattr(tag, '__name__', repr(tag))
    print(indent * '  ' + '<' + tag + '>')
    for child in tree.iterchildren():
        print_outline(child, indent=indent + 1)

In [63]:
# Print the tag structure of the whole table to find where the rows live.
print_outline(table)

<table>
  <tbody>
    <tr>
      <th>
      <th>
      <th>
        <br>
        <sup>
          <a>
        <sup>
          <a>
        <sup>
          <a>
      <th>
        <br>
        <sup>
          <a>
        <sup>
          <a>
      <th>
        <br>
        <sup>
          <a>
        <sup>
          <a>
        <sup>
          <a>
      <th>
        <br>
        <sup>
          <a>
        <sup>
          <a>
      <th>
        <br>
        <sup>
          <a>
        <sup>
          <a>
    <tr>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <a>
        <sup>
          <a>
        <br>
        <br>
      <td>
        <a>
        <i>
      <td>
        <a>
        <br>
        <br>
        <a>
        <i>
      <td>
      <td>
        <span>
          <sup>
    <tr>
      <td>
        <div>
          <div>
            <a>
              <img>
      <t

        <i>
      <td>
      <td>
        <span>
          <sup>
    <tr>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <a>
        <br>
        <br>
      <td>
        <a>
        <i>
      <td>
        <a>
        <br>
        <br>
        <a>
        <i>
      <td>
      <td>
        <span>
          <sup>
    <tr>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <a>
      <td>
        <a>
        <i>
      <td>
        <a>
        <br>
        <br>
        <a>
        <i>
      <td>
      <td>
        <span>
          <sup>
    <tr>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <div>
          <div>
            <a>
              <img>
      <td>
        <a>
        <sup>
          <a>
 

It looks like everything we need is in the ``<tbody>``, so we'll grab that.

In [64]:
# list(element) yields the direct children; it replaces the deprecated
# getchildren() and displays the same result.
list(table)

[<Element tbody at 0x7f074bbf24f8>]

In [65]:
# Indexing an element selects a direct child, so this takes the table's first
# child without going through the deprecated getchildren().
tbody = table[0]

In [97]:
# Display the extracted element to confirm it is the <tbody>.
tbody

<Element tbody at 0x7f074bbf24f8>

In [102]:
# Iterate the element directly (getchildren() is deprecated) and print the
# child tags on a single line to check what kind of nodes <tbody> holds.
for elem in tbody:
    print(elem.tag, end=' ')

tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr tr 

There are only ``<tr>`` elements (rows) in here, so it's probably the right place.
The first one is the header, the rest should be the countries:

In [103]:
# Materialize the <tbody> children as a list so they can be indexed and sliced;
# list(element) is the replacement for the deprecated getchildren().
rows = list(tbody)

In [69]:
# Split the rows: the first one is the column header, all others are data rows.
header, countries = rows[0], rows[1:]

In [72]:
# text_content() returns the text of all header cells, i.e. the column titles.
print(header.text_content())


Flag

Map

English common and formal names[18][19][20]

Domestic common and formal names[18][19]

Capital[20][21][22]

Population[a][23]

Area[a][24]



In [75]:
# All text of the first country row, concatenated across its cells.
countries[0].text_content()

'\n\n\n\n\nAlbania[i]Republic of Albania\n\nAlbanian: Shqipëri / Shqipëria — Republika e Shqipërisë\n\nTiranaAlbanian: Tiranë\n\n2,887,000\n\n28,748\xa0km2 (11,100\xa0sq\xa0mi)\n'

The 3rd column contains the country's name, but also text we don't want (a footnote marker and the formal name):

In [105]:
# Index 2 is the third cell of the row, the English-name column.
countries[0][2].text_content()

'Albania[i]Republic of Albania\n'

In [106]:
# Outline the name cell to see which child elements it contains.
print_outline(countries[0][2])

<td>
  <a>
  <sup>
    <a>
  <br>
  <br>


We need to dig deeper, so let's look at the complete HTML of that column:

In [89]:
from lxml import etree

In [107]:
# Serialize the name cell back to markup to inspect its attributes and text.
etree.tostring(countries[0][2])

b'<td><a href="/wiki/Albania" title="Albania">Albania</a><sup id="ref_i_1" class="reference"><a href="#cnote_i">[i]</a></sup><br/><br/>Republic of Albania\n</td>\n'

In [113]:
# Index of the "English common and formal names" cell within each row.
NAME_COLUMN = 2

# Collect the names as well as printing them, so later cells can reuse them.
country_names = []
for country in countries:
    if len(country) <= NAME_COLUMN:
        # Row has too few cells to hold a name column (e.g. a separator
        # row); indexing it would raise IndexError.
        continue
    name_column = country[NAME_COLUMN]
    country_link = name_column.find('a')  # first <a> that is a direct child of the cell
    if country_link is None:
        # find() returns None when the cell has no direct <a> child; skip the
        # row instead of crashing on None.get().
        continue
    country_name = country_link.get('title')  # the link's 'title' attribute
    country_names.append(country_name)
    print(country_name)

Albania
Andorra
Armenia
Austria
Azerbaijan
Belarus
Belgium
Bosnia and Herzegovina
Bulgaria
Croatia
Cyprus
Czech Republic
Denmark
Estonia
Finland
France
Georgia (country)
Germany
Greece
Hungary
Iceland
Republic of Ireland
Italy
Kazakhstan
Latvia
Liechtenstein
Lithuania
Luxembourg
Malta
Moldova
Monaco
Montenegro
Netherlands
Republic of North Macedonia
Norway
Poland
Portugal
Romania
Russia
San Marino
Serbia
Slovakia
Slovenia
Spain
Sweden
Switzerland
Turkey
Ukraine
United Kingdom
Vatican City
