In [1]:
import requests
from bs4 import BeautifulSoup


## Get content from webpage with requests

In [2]:
# Download the example page. A timeout keeps the cell from hanging forever on
# an unresponsive server, and raise_for_status() stops us from silently
# parsing an HTTP error page as if it were the real document.
response = requests.get("https://pythonhow.com/example.html", timeout=10)
response.raise_for_status()
content = response.content  # raw response body as bytes

content


b'<!DOCTYPE html>\n<html>\n<head>\n<style>\ndiv.cities {\n    background-color:black;\n    color:white;\n    margin:20px;\n    padding:20px;\n} \n</style>\n</head>\n<body>\n<h1 align="center"> Here are three big cities </h1>\n<div class="cities">\n<h2>London</h2>\n<p>London is the capital of England and it\'s been a British settlement since 2000 years ago. </p>\n</div>\n<div class="cities">\n<h2>Paris</h2>\n<p>Paris is the capital city of France. It was declared capital since 508.</p>\n</div>\n<div class="cities">\n<h2>Tokyo</h2>\n<p>Tokyo is the capital of Japan and one of the most populated cities in the world.</p>\n</div>\n</body>\n</html>'

## Put content into BeautifulSoup object for parsing

In [3]:
# Parse the downloaded bytes into a navigable tree using the "html.parser" backend.
soup = BeautifulSoup(markup=content, features="html.parser")

soup


<!DOCTYPE html>

<html>
<head>
<style>
div.cities {
    background-color:black;
    color:white;
    margin:20px;
    padding:20px;
} 
</style>
</head>
<body>
<h1 align="center"> Here are three big cities </h1>
<div class="cities">
<h2>London</h2>
<p>London is the capital of England and it's been a British settlement since 2000 years ago. </p>
</div>
<div class="cities">
<h2>Paris</h2>
<p>Paris is the capital city of France. It was declared capital since 508.</p>
</div>
<div class="cities">
<h2>Tokyo</h2>
<p>Tokyo is the capital of Japan and one of the most populated cities in the world.</p>
</div>
</body>
</html>

## Find all div sections with class type 'cities'

Through inspection, we see that city information is present in div sections with class = 'cities'. So, find these div sections.

In [4]:
# Every <div> whose class attribute is 'cities' holds one city's heading and blurb.
city_divs = soup.find_all(
    name='div',
    attrs={'class': 'cities'},
)

city_divs

[<div class="cities">
 <h2>London</h2>
 <p>London is the capital of England and it's been a British settlement since 2000 years ago. </p>
 </div>, <div class="cities">
 <h2>Paris</h2>
 <p>Paris is the capital city of France. It was declared capital since 508.</p>
 </div>, <div class="cities">
 <h2>Tokyo</h2>
 <p>Tokyo is the capital of Japan and one of the most populated cities in the world.</p>
 </div>]

## Find names (h2 tags) and info (p tags) within each of those div sections
Through inspection, we see that city names are heading2 (h2) elements and city info are paragraph (p) elements.  So, find these elements within the div sections we have already extracted.

In [5]:
# The first <h2> inside each city section carries the city's name.
city_names = [
    section.find('h2').text
    for section in city_divs
]

city_names


['London', 'Paris', 'Tokyo']

In [6]:
# The first <p> inside each city section carries the descriptive text.
city_info = [
    section.find('p').text
    for section in city_divs
]

city_info


["London is the capital of England and it's been a British settlement since 2000 years ago. ",
 'Paris is the capital city of France. It was declared capital since 508.',
 'Tokyo is the capital of Japan and one of the most populated cities in the world.']

## Now that we have the data, we could work with it using...

### ...a list of tuples...

In [7]:
# Pair each city name with its description. zip() alone returns a one-shot
# iterator that the loop below would exhaust, leaving city_tuples empty for
# any later use; list() materializes the pairs so they can be reused.
city_tuples = list(zip(city_names, city_info))

for item in city_tuples:
    print(item)
    

('London', "London is the capital of England and it's been a British settlement since 2000 years ago. ")
('Paris', 'Paris is the capital city of France. It was declared capital since 508.')
('Tokyo', 'Tokyo is the capital of Japan and one of the most populated cities in the world.')


### ...or dictionary...

In [8]:
# Map each city's name (<h2> text) to its description (<p> text).
# If two sections shared a name, the later one would win.
city_dict = {
    city.find('h2').text: city.find('p').text
    for city in city_divs
}

city_dict


{'London': "London is the capital of England and it's been a British settlement since 2000 years ago. ",
 'Paris': 'Paris is the capital city of France. It was declared capital since 508.',
 'Tokyo': 'Tokyo is the capital of Japan and one of the most populated cities in the world.'}

### ...or Pandas dataframe...

In [9]:
import pandas

# Build the frame in a single step from the scraped columns instead of
# growing an empty DataFrame one column at a time.
df = pandas.DataFrame({
    'City Name': [city.find('h2').text for city in city_divs],
    'City Info': [city.find('p').text for city in city_divs],
})

# Name the index only after the rows exist, so the label does not depend on
# how pandas rebuilds the index of an empty frame on first column assignment.
df.index.name = 'Index'

df


Unnamed: 0,City Name,City Info
0,London,London is the capital of England and it's been...
1,Paris,Paris is the capital city of France. It was de...
2,Tokyo,Tokyo is the capital of Japan and one of the m...


## Send results to csv file

In [10]:
# Write the table (index column included) to cities.csv in the working directory.
df.to_csv(path_or_buf='cities.csv')
