In [1]:
# https://github.com/realpython/list-of-python-api-wrappers

from bs4 import BeautifulSoup as bs
import requests

# Download the landing page. The timeout keeps the cell from hanging forever
# on an unresponsive host, and raise_for_status() stops us from parsing an
# HTTP error page as if it were the real content.
response = requests.get("http://automationpractice.com/index.php", timeout=10)
response.raise_for_status()
html = response.text
soup = bs(html, 'html5lib')

# soup.p is shorthand for soup.find('p'): the first <p> tag in the document.
# Look it up once and reuse the result instead of re-searching the tree.
first_paragraph = soup.p

para_text = first_paragraph.text
para_words = para_text.split()

print('Text:', para_text)
print('Words:', para_words)

# first_paragraph['id'] would raise KeyError when the tag has no id;
# .get('id') returns None instead.
first_para_id = first_paragraph.get('id')

print(first_para_id)

Text: 
							No products
						
Words: ['No', 'products']
None


In [2]:
# Several tags at once: calling the soup directly, soup('p'), is shorthand
# for soup.find_all('p').
all_paras = soup('p')
print(all_paras[:5])

# Keep the text of only those paragraphs that carry a non-empty id attribute.
paras_with_ids = []
for para in soup.find_all('p'):
    if para.get('id'):
        paras_with_ids.append(para.text)
print(paras_with_ids)

[<p class="cart_block_no_products">
							No products
						</p>, <p class="cart-buttons">
							<a class="btn btn-default button button-small" href="http://automationpractice.com/index.php?controller=order" id="button_order_cart" rel="nofollow" title="Check out">
								<span>
									Check out<i class="icon-chevron-right right"></i>
								</span>
							</a>
						</p>, <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin tristique in tortor et dignissim. Quisque non tempor leo. Maecenas egestas sem elit</p>, <p><button class="btn btn-default" type="button">Shop now !</button></p>, <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin tristique in tortor et dignissim. Quisque non tempor leo. Maecenas egestas sem elit</p>]
['Subsidiary of seleniumframework.com']


In [3]:
# For every <div> in the page, gather the text of each <span> found by
# searching that div; results are appended in document order of the divs.
spantexts_inside_divs = []
for div in soup('div'):
    spantexts_inside_divs.extend(span.text for span in div('span'))
print(spantexts_inside_divs[:10])

['\n\t\tCall us now: 0123-456-789\n\t', 'Search', '0', 'Product', 'Products', '\n\t\t\t\t\t\t\t', '(empty)', '\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFree shipping!\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\tShipping\n\t\t\t\t\t\t\t\t', '$0.00']


In [9]:
# A bare string passed as the second positional argument filters by CSS
# class, i.e. this is the same search as
# soup.find_all('div', class_='header_user_info').
# find_all is the current method name; findAll is only a legacy alias.
header_user_info = soup.find_all('div', 'header_user_info')
print(len(header_user_info))
print(header_user_info)

# Other attributes can be matched by passing a dict of attribute -> value.
example_id = soup.find_all('div', {'id': 'contact-link'})
print(example_id)

1
[<div class="header_user_info">
			<a class="login" href="http://automationpractice.com/index.php?controller=my-account" rel="nofollow" title="Log in to your customer account">
			Sign in
		</a>
	</div>]
[<div id="contact-link">
	<a href="http://automationpractice.com/index.php?controller=contact" title="Contact Us">Contact us</a>
</div>]


In [36]:
# parsing JSON and XML

# JSON
import json

# A small JSON document held as raw text.
serialized = """{ "title" : "Data Science Book",
            "author" : "Joel Grus",
            "publicationYear" : 2014,
            "topics" : [ "data", "science", "data science"] }"""

print('JSON')
print(f'Serialized: {serialized}')

# json.loads turns the JSON text into plain Python objects (a dict here).
deserialized = json.loads(serialized)

# Show the parsed record only when "data science" is one of its topics.
if any(topic == "data science" for topic in deserialized["topics"]):
    print(deserialized)

# XML
import lxml

# Read the whole document in one call. Parsing does not need the file
# handle, so it happens after the with-block has closed the file.
with open('./files/sample.xml', 'r') as f:
    content = f.read()

# NOTE(review): 'lxml' selects lxml's HTML parser; BeautifulSoup's XML mode
# is selected with 'xml'. Left as-is so the printed results do not change.
bs_content = bs(content, 'lxml')

print('---------------------------------------------\nXML')
# Only the <topic> tags whose class attribute is "data-science".
print(bs_content.find_all('topic', {'class': 'data-science'}))
# Every <topic> tag.
print(bs_content.find_all('topic'))
# Attribute-style navigation: first <book>, then its first <author>.
print(bs_content.book.author)

JSON
Serialized: { "title" : "Data Science Book",
            "author" : "Joel Grus",
            "publicationYear" : 2014,
            "topics" : [ "data", "science", "data science"] }
{'title': 'Data Science Book', 'author': 'Joel Grus', 'publicationYear': 2014, 'topics': ['data', 'science', 'data science']}
---------------------------------------------
XML
[<topic class="data-science">data</topic>, <topic class="data-science">data science</topic>]
[<topic class="data-science">data</topic>, <topic>science</topic>, <topic class="data-science">data science</topic>]
<author>Joel Grus</author>


In [52]:
# Using an unauthenticated api
# example: github
from collections import Counter

import requests, json
from dateutil.parser import parse
endpoint = "https://api.github.com/users/yunuskaratepe/repos"

# Fail fast on a hung connection or an HTTP error. Without the status check
# an error response (e.g. when rate-limited) would be decoded into an error
# object instead of a list of repos and break confusingly further down.
# NOTE(review): the API paginates; this reads only the first page of results.
response = requests.get(endpoint, timeout=10)
response.raise_for_status()
repos = response.json()

print('Dictionary Keys')
if repos:  # guard: an empty repo list has no first element to inspect
    print(repos[0].keys())
print('------------------------------------------------------\nDates')
# created_at is a timestamp string; dateutil's parse turns it into a datetime.
dates = [parse(repo["created_at"]) for repo in repos]
print(dates)
print('------------------------------------------------------')
# Tally how many repos were created per month, per year and per weekday
# (weekday() numbers Monday as 0 through Sunday as 6).
month_counts = Counter(date.month for date in dates)
print(month_counts)
year_counts = Counter(date.year for date in dates)
print(year_counts)
weekday_counts = Counter(date.weekday() for date in dates)
print(weekday_counts)

Dictionary Keys
dict_keys(['id', 'node_id', 'name', 'full_name', 'private', 'owner', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'forks_count', 'mirror_url', 'archived', 'disabled', 'open_iss