# Web Scraping from a Website

**1. Install all the libraries required using pip install**
* Follow Web_Scrapping_Documentation.docx

**2. Import libraries required to scrape a website**

In [1]:
#import requests and BeautifulSoup library
import requests
import bs4

**3. Connect to 'https://quotes.toscrape.com/' and get the html text**

In [14]:
# Fetch the home page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces an HTTP error here instead of
# letting a later cell parse an error page.
res = requests.get('https://quotes.toscrape.com/', timeout=10)
res.raise_for_status()

In [16]:
#can check the contents of the response
#res.content 
#or 
#res.text

In [17]:
# Parse the response HTML with the lxml parser so it can be queried with CSS selectors
soup = bs4.BeautifulSoup(res.text, features='lxml')

In [19]:
#BeautifulSoup converts it into a more readable format
#soup

**TASK1: Get the names of all the authors on the first page**

**4. Inspect the page (or right-click any author name on the page and choose Inspect) to find the tag specific to authors**

In [23]:
# soup.select takes a CSS selector; '.author' matches every element with
# class "author", and the matches come back as a list of tags
soup.select('.author')

[<small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Jane Austen</small>,
 <small class="author" itemprop="author">Marilyn Monroe</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">André Gide</small>,
 <small class="author" itemprop="author">Thomas A. Edison</small>,
 <small class="author" itemprop="author">Eleanor Roosevelt</small>,
 <small class="author" itemprop="author">Steve Martin</small>]

In [24]:
# Index into the result list to get the first matching tag
soup.select('.author')[0]

<small class="author" itemprop="author">Albert Einstein</small>

In [27]:
# .contents holds the tag's children as a list; here that is just the name string
soup.select('.author')[0].contents

['Albert Einstein']

In [34]:
#Final:
# Build the list of author names (in page order) from the text of each author tag
authors = [element.text for element in soup.select('.author')]

In [35]:
# Show the collected author names
authors

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

**TASK2: Create a list of all the quotes on the first page**

In [36]:
# Same approach as for the authors (step 4): use Inspect to find the class
# attached to the quotes, then select every element with that class
soup.select('.text')

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>,
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>,
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>,
 <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>,
 <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>,
 <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>,
 <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.

In [37]:
# Text of the first quote element
soup.select('.text')[0].text

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [38]:
#Final:
# Build the list of quotes (in page order) from the text of each quote tag
quotes = [element.text for element in soup.select('.text')]

In [39]:
# Show the collected quotes
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

**TASK3: Inspect the site and use Beautiful Soup to extract, from the response text, the top ten tags shown at the top right of the home page (e.g. Love, Inspirational, Life, etc.).**

In [80]:
# Select every element with class "tag-item" (one per tag in the top-ten box)
soup.select('.tag-item')

[<span class="tag-item">
 <a class="tag" href="/tag/love/" style="font-size: 28px">love</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/inspirational/" style="font-size: 26px">inspirational</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/life/" style="font-size: 26px">life</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/humor/" style="font-size: 24px">humor</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/books/" style="font-size: 22px">books</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/reading/" style="font-size: 14px">reading</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friendship/" style="font-size: 10px">friendship</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friends/" style="font-size: 8px">friends</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/truth/" style="font-size: 8px">truth</a>
 </span>,
 <span class="tag-item">
 <a class="

In [81]:
# Keep the first tag-item element to examine it on its own
result = soup.select('.tag-item')[0]

In [82]:
# Show the element's markup
result

<span class="tag-item">
<a class="tag" href="/tag/love/" style="font-size: 28px">love</a>
</span>

In [83]:
# .text includes the newlines that surround the inner <a> tag
result.text

'\nlove\n'

In [85]:
#Final:
# Print the text of each tag item as-is (the surrounding newlines are kept)
for tag in soup.select('.tag-item'):
    print(tag.get_text())


love


inspirational


life


humor


books


reading


friendship


friends


truth


simile



**TASK4: Loop through all the pages and get all the unique authors on the website.**

In [99]:
# URL template for the paginated listing; the {} placeholder takes the page
# number, e.g. 'https://quotes.toscrape.com/page/1/'
base_url = 'https://quotes.toscrape.com/page/{}/'

In [101]:
# Check that the template produces the expected URL for page 1
base_url.format(1)

'https://quotes.toscrape.com/page/1/'

In [103]:
#1. Request the first page through the URL template. The timeout keeps the
#   cell from hanging on a stalled connection, and raise_for_status() surfaces
#   an HTTP error here instead of letting a later cell parse an error page.
res = requests.get(base_url.format(1), timeout=10)
res.raise_for_status()

In [106]:
#2. Parse the page HTML with the lxml parser into a BeautifulSoup object
soup = bs4.BeautifulSoup(res.text, features='lxml')

In [108]:
#soup

In [111]:
#3. Inspect the page to find the class used for author names, then select
#   every element with that class
soup.select('.author')

[<small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Jane Austen</small>,
 <small class="author" itemprop="author">Marilyn Monroe</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">André Gide</small>,
 <small class="author" itemprop="author">Thomas A. Edison</small>,
 <small class="author" itemprop="author">Eleanor Roosevelt</small>,
 <small class="author" itemprop="author">Steve Martin</small>]

In [112]:
# Keep the first author element to examine it on its own
result = soup.select('.author')[0]

In [114]:
# The element's text is the author's name
result.text

'Albert Einstein'

In [2]:
#Final:
# Walk the paginated listing page by page and collect every distinct author
# name. The loop stops at the first page that contains no author elements.
base_url = 'https://quotes.toscrape.com/page/{}/'
authors = set()
n = 1
while True:
    # The timeout keeps a stalled connection from hanging the loop forever.
    res = requests.get(base_url.format(n), timeout=10)
    n = n+1

    # A missing page means we ran past the end of the listing; any other HTTP
    # error is a real failure and must not be parsed as if it were an empty page.
    if res.status_code == 404:
        break
    res.raise_for_status()

    soup = bs4.BeautifulSoup(res.text,'lxml')

    # Run the selector once per page and reuse the result for both the
    # end-of-pagination check and the collection step.
    page_authors = soup.select('.author')
    if not page_authors:
        break
    # A set keeps each name once, however many quotes the author has.
    authors.update(author.text for author in page_authors)

In [3]:
# Show the set of unique author names gathered from all pages
authors

{'Albert Einstein',
 'Alexandre Dumas fils',
 'Alfred Tennyson',
 'Allen Saunders',
 'André Gide',
 'Ayn Rand',
 'Bob Marley',
 'C.S. Lewis',
 'Charles Bukowski',
 'Charles M. Schulz',
 'Douglas Adams',
 'Dr. Seuss',
 'E.E. Cummings',
 'Eleanor Roosevelt',
 'Elie Wiesel',
 'Ernest Hemingway',
 'Friedrich Nietzsche',
 'Garrison Keillor',
 'George Bernard Shaw',
 'George Carlin',
 'George Eliot',
 'George R.R. Martin',
 'Harper Lee',
 'Haruki Murakami',
 'Helen Keller',
 'J.D. Salinger',
 'J.K. Rowling',
 'J.M. Barrie',
 'J.R.R. Tolkien',
 'James Baldwin',
 'Jane Austen',
 'Jim Henson',
 'Jimi Hendrix',
 'John Lennon',
 'Jorge Luis Borges',
 'Khaled Hosseini',
 "Madeleine L'Engle",
 'Marilyn Monroe',
 'Mark Twain',
 'Martin Luther King Jr.',
 'Mother Teresa',
 'Pablo Neruda',
 'Ralph Waldo Emerson',
 'Stephenie Meyer',
 'Steve Martin',
 'Suzanne Collins',
 'Terry Pratchett',
 'Thomas A. Edison',
 'W.C. Fields',
 'William Nicholson'}

In [5]:
# Number of unique authors found across all pages
len(authors)

50