# Web Scraping of BlockChain Whitepapers

### Using the following libraries for data extraction from the webpage https://www.allcryptowhitepapers.com/whitepaper-overview/

* Requests
* BeautifulSoup4
* Selenium (Optional)

In [1]:
#Loading Libraries

import re
import time
import urllib
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = "https://www.allcryptowhitepapers.com/whitepaper-overview"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page,"html.parser")

In [3]:
#Extract title of webpage
print(soup.title.text)

All Crypto Whitepapers | Your #1 source for cryptocurrency research


In [4]:
#Extract the table from containing links to the white papers

table = soup.find("table")

table_rows = table.find_all('td')

print(table_rows)

[<td class="column-1">Bitcoin</td>, <td class="column-2"><a href="https://www.allcryptowhitepapers.com/Bitcoin-Whitepaper" target='_blank"'>Whitepaper</a></td>, <td class="column-3"></td>, <td class="column-1">Ethereum</td>, <td class="column-2"><a href="https://www.allcryptowhitepapers.com/Ethereum-Whitepaper" target='_blank"'>Whitepaper</a></td>, <td class="column-3"></td>, <td class="column-1">Ripple - XRP</td>, <td class="column-2"><a href="https://www.allcryptowhitepapers.com/Ripple-Whitepaper" target='_blank"'>Whitepaper</a></td>, <td class="column-3"></td>, <td class="column-1">Litecoin</td>, <td class="column-2"><a href="https://www.allcryptowhitepapers.com/Litecoin-Whitepaper" target='_blank"'>Whitepaper</a></td>, <td class="column-3"></td>, <td class="column-1">Bitcoin Cash</td>, <td class="column-2"><a href="https://www.allcryptowhitepapers.com/BitcoinCash-Whitepaper" target='_blank"'>Whitepaper</a></td>, <td class="column-3"></td>, <td class="column-1">EOS</td>, <td class="

In [5]:
#Extract links from table
links = table.find_all('a')

whitepaper_urls = []
for link in links:
    url = link.get('href')
    whitepaper_urls.append(url)

In [6]:
print(whitepaper_urls)

['https://www.allcryptowhitepapers.com/Bitcoin-Whitepaper', 'https://www.allcryptowhitepapers.com/Ethereum-Whitepaper', 'https://www.allcryptowhitepapers.com/Ripple-Whitepaper', 'https://www.allcryptowhitepapers.com/Litecoin-Whitepaper', 'https://www.allcryptowhitepapers.com/BitcoinCash-Whitepaper', 'https://www.allcryptowhitepapers.com/EOS-Whitepaper', 'https://www.allcryptowhitepapers.com/BinanceCoin-Whitepaper', 'https://www.allcryptowhitepapers.com/Tether-Whitepaper', 'https://www.allcryptowhitepapers.com/Bitcoin-Cash-SV-whitepaper/', 'https://www.allcryptowhitepapers.com/TRON-Whitepaper', 'https://www.allcryptowhitepapers.com/Stellar-Whitepaper', 'https://www.allcryptowhitepapers.com/Cardano-Whitepaper', 'https://www.allcryptowhitepapers.com/Monero-Whitepaper', 'https://www.allcryptowhitepapers.com/unus-sed-leo-whitepaper/', 'https://www.allcryptowhitepapers.com/Dash-Whitepaper', 'https://www.allcryptowhitepapers.com/NEO-Whitepaper', 'https://www.allcryptowhitepapers.com/ChainLink

In [7]:
df1 = pd.DataFrame(whitepaper_urls,columns=['Whitepaper_URLs'])
print(df1)

                                        Whitepaper_URLs
0     https://www.allcryptowhitepapers.com/Bitcoin-W...
1     https://www.allcryptowhitepapers.com/Ethereum-...
2     https://www.allcryptowhitepapers.com/Ripple-Wh...
3     https://www.allcryptowhitepapers.com/Litecoin-...
4     https://www.allcryptowhitepapers.com/BitcoinCa...
5     https://www.allcryptowhitepapers.com/EOS-White...
6     https://www.allcryptowhitepapers.com/BinanceCo...
7     https://www.allcryptowhitepapers.com/Tether-Wh...
8     https://www.allcryptowhitepapers.com/Bitcoin-C...
9     https://www.allcryptowhitepapers.com/TRON-Whit...
10    https://www.allcryptowhitepapers.com/Stellar-W...
11    https://www.allcryptowhitepapers.com/Cardano-W...
12    https://www.allcryptowhitepapers.com/Monero-Wh...
13    https://www.allcryptowhitepapers.com/unus-sed-...
14    https://www.allcryptowhitepapers.com/Dash-Whit...
15    https://www.allcryptowhitepapers.com/NEO-White...
16    https://www.allcryptowhitepapers.com/Chain

In [8]:
#Extract titles
whitepaper_titles = soup.find_all('td',attrs={'class':'column-1'})
print(whitepaper_titles)

[<td class="column-1">Bitcoin</td>, <td class="column-1">Ethereum</td>, <td class="column-1">Ripple - XRP</td>, <td class="column-1">Litecoin</td>, <td class="column-1">Bitcoin Cash</td>, <td class="column-1">EOS</td>, <td class="column-1">Binance Coin</td>, <td class="column-1">Tether</td>, <td class="column-1">Bitcoin Cash SV </td>, <td class="column-1">TRON</td>, <td class="column-1">Stellar</td>, <td class="column-1">Cardano</td>, <td class="column-1">Monero</td>, <td class="column-1">Unus Sed Leo</td>, <td class="column-1">Dash</td>, <td class="column-1">NEO</td>, <td class="column-1">ChainLink</td>, <td class="column-1">IOTA</td>, <td class="column-1">Cosmos</td>, <td class="column-1">Tezos (Pre-Launch)</td>, <td class="column-1">Ethereum Classic</td>, <td class="column-1">Crypto.com Chain</td>, <td class="column-1">Maker</td>, <td class="column-1">Ontology</td>, <td class="column-1">NEM</td>, <td class="column-1">Basic Attention Token</td>, <td class="column-1">Zcash</td>, <td c

In [9]:
type(whitepaper_titles)

bs4.element.ResultSet

In [10]:
str_titles = str(whitepaper_titles)
crypto_lists = BeautifulSoup(str_titles,"lxml").get_text()
print(crypto_lists)

type(crypto_lists)

[Bitcoin, Ethereum, Ripple - XRP, Litecoin, Bitcoin Cash, EOS, Binance Coin, Tether, Bitcoin Cash SV , TRON, Stellar, Cardano, Monero, Unus Sed Leo, Dash, NEO, ChainLink, IOTA, Cosmos, Tezos (Pre-Launch), Ethereum Classic, Crypto.com Chain, Maker, Ontology, NEM, Basic Attention Token, Zcash, Bitcoin Gold, VeChain, Dogecoin, 0chain, 0x, 0xBitcoin, 0xcert, 1irstcoin, 1SG, 1World, 1x2 Coin, 2GIVE, 300 Token, 42-coin, 4NEW, 808Coin, 8Bit, AB-Chain RTB, ABCC Token, Abjcoin, ABLE, Absolute, Abulaba, AC3, Accelerator Network, ACChain, Ace, ACE (TokenStars), AceD, Aces, Achain, Acoin, ACRE, Actinium, Acute Angle Cloud, Adamant Messenger, adbank, AdCoin, Adenz, AdEx, AdHive, Aditus, Adrenaline, AdShares, adToken, AdultChain, Advanced Internet Blocks, Advanced Technology Coin, Adzcoin, Aegeus, aelf, AENCoin, Aeon, AERGO, Aerium, Aeron, Aeternity, AgaveCoin, Agrello, AgrolifeCoin, Agrolot, AI Doctor, AICHAIN, AidCoin, Aidos Kuneen, Aidus, Aigang, AiLink Token, Aion, Airbloc, Airline & Life Networ

str

In [11]:
whitepaper_titles =crypto_lists.split(",")

In [12]:
df = pd.DataFrame(whitepaper_titles,columns=['Cryptocurrency'])
print(df)

              Cryptocurrency
0                   [Bitcoin
1                   Ethereum
2               Ripple - XRP
3                   Litecoin
4               Bitcoin Cash
5                        EOS
6               Binance Coin
7                     Tether
8           Bitcoin Cash SV 
9                       TRON
10                   Stellar
11                   Cardano
12                    Monero
13              Unus Sed Leo
14                      Dash
15                       NEO
16                 ChainLink
17                      IOTA
18                    Cosmos
19        Tezos (Pre-Launch)
20          Ethereum Classic
21          Crypto.com Chain
22                     Maker
23                  Ontology
24                       NEM
25     Basic Attention Token
26                     Zcash
27              Bitcoin Gold
28                   VeChain
29                  Dogecoin
...                      ...
2796                 ZenCash
2797                 ZenGold
2798          

In [13]:
import numpy as np

In [14]:
whitepaper_data = pd.concat([df,df1],sort=True,axis=1)
whitepaper_data

Unnamed: 0,Cryptocurrency,Whitepaper_URLs
0,[Bitcoin,https://www.allcryptowhitepapers.com/Bitcoin-W...
1,Ethereum,https://www.allcryptowhitepapers.com/Ethereum-...
2,Ripple - XRP,https://www.allcryptowhitepapers.com/Ripple-Wh...
3,Litecoin,https://www.allcryptowhitepapers.com/Litecoin-...
4,Bitcoin Cash,https://www.allcryptowhitepapers.com/BitcoinCa...
5,EOS,https://www.allcryptowhitepapers.com/EOS-White...
6,Binance Coin,https://www.allcryptowhitepapers.com/BinanceCo...
7,Tether,https://www.allcryptowhitepapers.com/Tether-Wh...
8,Bitcoin Cash SV,https://www.allcryptowhitepapers.com/Bitcoin-C...
9,TRON,https://www.allcryptowhitepapers.com/TRON-Whit...


In [15]:
len(whitepaper_data)

2826