# Solution: WEB SCRAPING ASSIGNMENT 4

In [1]:
#importing libraries
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

### Q1. Scrape the details of most viewed videos on YouTube from Wikipedia:
#### Url = https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos
**You need to find following details:**

**A) Rank**

**B) Name**

**C) Artist**

**D) Upload date**

**E) Views**

In [2]:
#Scrape rank, name, artist, upload date and views of the most-viewed YouTube
#videos from the Wikipedia list page and collect them in a dataframe.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')

#one list per output column; rows are appended in table order
ranks = []
names = []
artists = []
upload_dates = []
views = []

try:
    driver.get('https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos')

    #wait for the table whose caption mentions "most-viewed"
    views_container = WebDriverWait(driver, 20).until(
        ec.visibility_of_element_located(
            (By.XPATH, '//table[.//caption[contains(text(),"most-viewed")]]')
        )
    )

    for row in views_container.find_elements(By.XPATH, './/tbody/tr'):
        cols = row.find_elements(By.XPATH, './/td')

        #header/footer rows do not carry the five data cells read below
        if len(cols) < 5:
            continue

        #the rank cell may carry extra characters; keep only the number
        rank_match = re.search(r'\d+', cols[0].text)
        if rank_match is None:
            continue

        #video names are wrapped in double quotes; fall back to the raw cell
        #text when no quoted part is present
        name_text = cols[1].text
        name_match = re.search(r'"(.*?)"', name_text)

        ranks.append(rank_match.group(0))
        names.append(name_match.group(1) if name_match else name_text)
        artists.append(cols[2].text)
        upload_dates.append(cols[4].text)
        views.append(cols[3].text)
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped details into dataframe
df_most_viewed_yt_videos = pd.DataFrame({
    "Rank": ranks,
    "Name": names,
    "Artist": artists,
    "Upload Date": upload_dates,
    "Views (billions)": views,
})

#Interpreting scraped details
df_most_viewed_yt_videos

Unnamed: 0,Rank,Name,Artist,Upload Date,Views (billions)
0,1,Baby Shark Dance,Pinkfong Kids' Songs & Stories,"June 17, 2016",8.44
1,2,Despacito,Luis Fonsi,"January 12, 2017",7.32
2,3,Shape of You,Ed Sheeran,"January 30, 2017",5.29
3,4,Johny Johny Yes Papa,LooLoo Kids,"October 8, 2016",5.24
4,5,See You Again,Wiz Khalifa,"April 6, 2015",5.08
5,6,Masha and the Bear – Recipe for Disaster,Get Movies,"January 31, 2012",4.43
6,7,Uptown Funk,Mark Ronson,"November 19, 2014",4.16
7,8,Gangnam Style,Psy,"July 15, 2012",4.05
8,9,Learning Colors – Colorful Eggs on a Farm,Miroshka TV,"February 27, 2018",3.97
9,10,Bath Song,Cocomelon – Nursery Rhymes,"May 2, 2018",3.95


### Q2. Scrape the details of team India’s international fixtures from bcci.tv.
#### Url = https://www.bcci.tv/.
**You need to find following details:**

**A) Match title (I.e. 1st ODI)**

**B) Series**

**C) Place**

**D) Date**

**E) Time**

***Note: - From the bcci.tv home page you have to reach the international fixtures page through code.***

In [3]:
#Scrape team India's international fixtures (match title, series, place, date
#and time) from bcci.tv, reaching the fixtures page through the site menu.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')

#one list per output column
match_titles = []
serieses = []
places = []
dates = []
times = []

try:
    driver.get('https://www.bcci.tv')

    #Clicking on menu International->Fixtures
    international = WebDriverWait(driver, 10).until(
        ec.element_to_be_clickable((By.XPATH, '//li[@data-nav-index="0"]'))
    )
    international.click()
    international.find_element(By.XPATH, './/a[contains(text(),"Fixtures")]').click()

    #each <a> inside the js-list container is one fixture card
    fixture_containers = WebDriverWait(driver, 10).until(
        ec.visibility_of_all_elements_located((By.XPATH, '//div[@class="js-list"]/a'))
    )

    for fixture in fixture_containers:
        #scraping match title
        title_el = fixture.find_element(By.XPATH, './/p[@class="fixture__additional-info"]/strong')
        match_titles.append(title_el.text)

        #scraping series; the strip renders on several lines, join them with " - "
        series_el = fixture.find_element(By.XPATH, './/div[@class="fixture__format-strip"]')
        serieses.append(series_el.text.replace('\n', ' - '))

        #scraping place
        place_el = fixture.find_element(By.XPATH, './/p[@class="fixture__additional-info"]/span')
        places.append(place_el.text)

        #scraping date, assembled as "<day>, <date> <month>"
        day_el = fixture.find_element(By.XPATH, './/span[@class="fixture__day"]')
        month_el = fixture.find_element(By.XPATH, './/span[@class="fixture__month"]')
        date_el = fixture.find_element(By.XPATH, './/span[@class="fixture__date"]')
        dates.append(f'{day_el.text}, {date_el.text} {month_el.text}')

        #scraping time
        time_el = fixture.find_element(By.XPATH, './/span[@class="fixture__time"]')
        times.append(time_el.text)
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_fixtures_international = pd.DataFrame({
    "Match Title": match_titles,
    "Series": serieses,
    "Place": places,
    "Date": dates,
    "Time": times,
})

#Interpreting scraped data
df_fixtures_international

Unnamed: 0,Match Title,Series,Place,Date,Time
0,Final,TEST - ICC WORLD TEST CHAMPIONSHIP FINAL,"The Ageas Bowl, Southampton","Friday, 18 JUNE",15:30 IST
1,1st Test,TEST - ENGLAND V INDIA 2021,"Trent Bridge, Nottingham","Wednesday, 04 AUGUST",15:30 IST
2,2nd Test,TEST - ENGLAND V INDIA 2021,"Lord's, London","Thursday, 12 AUGUST",15:30 IST
3,3rd Test,TEST - ENGLAND V INDIA 2021,"Headingley, Leeds","Wednesday, 25 AUGUST",15:30 IST
4,4th Test,TEST - ENGLAND V INDIA 2021,"The Oval, London","Thursday, 02 SEPTEMBER",15:30 IST
5,5th Test,TEST - ENGLAND V INDIA 2021,"Old Trafford, Manchester","Friday, 10 SEPTEMBER",15:30 IST


### Q3. Scrape the details of Selenium exceptions from guru99.com.
#### Url = https://www.guru99.com/
**You need to find following details:**

**A) Name**

**B) Description**

***Note: - From the guru99 home page you have to reach the Selenium exception handling page through code.***

In [4]:
#Scrape the name and description of each Selenium exception listed on
#guru99.com, reaching the exception handling page through the site menu.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')

#one list per output column
names = []
descriptions = []

try:
    driver.get('https://www.guru99.com')

    #navigating testing->selenium->selenium exception handling:
    #hover over the second main-nav entry to reveal its sub-menu, then click Selenium
    elem = driver.find_element(By.XPATH, '//nav[@class="g-main-nav"]/ul/li[2]')
    ActionChains(driver).move_to_element(elem).perform()
    WebDriverWait(driver, 20).until(
        ec.element_to_be_clickable(
            (By.XPATH, '//span[@class="g-menu-item-title" and contains(text(),"Selenium")]')
        )
    ).click()

    #the tutorial link sits in the cell preceding the "Selenium Exception Handling" text
    WebDriverWait(driver, 10).until(
        ec.element_to_be_clickable(
            (By.XPATH, '//td[contains(text(),"Selenium Exception Handling")]/preceding-sibling::td/a')
        )
    ).click()

    exception_containers = driver.find_elements(
        By.XPATH, '//table[@class="table table-striped"]/tbody/tr'
    )

    #the first row is skipped (the original code treats it as a header)
    for exception in exception_containers[1:]:
        cols = exception.find_elements(By.XPATH, './/td')

        #ignore rows that do not carry both a name and a description cell
        if len(cols) < 2:
            continue

        names.append(cols[0].text)
        descriptions.append(cols[1].text)
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing records into dataframe
df_exceptions = pd.DataFrame({
    "Name": names,
    "Description": descriptions,
})

#Interpreting exceptions details
df_exceptions


Unnamed: 0,Name,Description
0,ElementNotVisibleException,This type of Selenium exception occurs when an...
1,ElementNotSelectableException,This Selenium exception occurs when an element...
2,NoSuchElementException,This Exception occurs if an element could not ...
3,NoSuchFrameException,This Exception occurs if the frame target to b...
4,NoAlertPresentException,This Exception occurs when you switch to no pr...
5,NoSuchWindowException,This Exception occurs if the window target to ...
6,StaleElementReferenceException,This Selenium exception occurs happens when th...
7,SessionNotFoundException,The WebDriver is acting after you quit the bro...
8,TimeoutException,Thrown when there is not enough time for a com...
9,WebDriverException,This Exception takes place when the WebDriver ...


### Q4. Scrape the details of State-wise GDP of India from statisticstimes.com.
#### Url = http://statisticstimes.com/
**You have to find following details:**

**A) Rank**

**B) State**

**C) GSDP(18-19)**

**D) GSDP(17-18)**

**E) Share(2017)**

**F) GDP($ billion)**

***Note: - From the statisticstimes home page you have to reach the economy page through code.***

In [5]:
#Scrape the state-wise GDP table of India from statisticstimes.com, reaching it
#through the Economy -> India menu and the "GDP of Indian states" link.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')

#one list per output column
ranks = []
states = []
gsdp_19_20 = []
gsdp_18_19 = []
shares = []
gdp_billions = []

try:
    driver.get('http://statisticstimes.com')

    #clicking on economy -> india
    economy_xpath = '//button[@class="dropbtn" and contains(text(),"Economy")]'
    WebDriverWait(driver, 3).until(
        ec.element_to_be_clickable((By.XPATH, economy_xpath))
    ).click()
    WebDriverWait(driver, 3).until(
        ec.element_to_be_clickable(
            (By.XPATH, economy_xpath + '/following-sibling::div/a[contains(text(),"India")]')
        )
    ).click()

    #manually close the ad pop-up (not automated; the next wait allows 10 s for it)

    #clicking on link GDP of Indian States
    WebDriverWait(driver, 10).until(
        ec.element_to_be_clickable((By.XPATH, '//a[contains(text(),"GDP of Indian states")]'))
    ).click()

    #every body row of the table with id "table_id" is one state
    gdp_containers = driver.find_elements(By.XPATH, '//table[@id="table_id"]/tbody/tr')

    for gdp in gdp_containers:
        cols = gdp.find_elements(By.XPATH, './/td')

        #ignore rows that do not carry the six data cells read below
        if len(cols) < 6:
            continue

        ranks.append(cols[0].text)
        states.append(cols[1].text)
        gsdp_19_20.append(cols[2].text)
        gsdp_18_19.append(cols[3].text)
        shares.append(cols[4].text)
        gdp_billions.append(cols[5].text)
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_gdp_indian_states = pd.DataFrame({
    "Rank": ranks,
    "State": states,
    "GSDP (19-20)": gsdp_19_20,
    "GSDP (18-19)": gsdp_18_19,
    "Share (18-19)": shares,
    "GDP ($billion)": gdp_billions,
})

#interpreting scraped gdp data
df_gdp_indian_states


Unnamed: 0,Rank,State,GSDP (19-20),GSDP (18-19),Share (18-19),GDP ($billion)
0,1,Maharashtra,-,2632792,13.88%,398.145
1,2,Tamil Nadu,1845853,1630208,8.59%,246.529
2,3,Uttar Pradesh,1687818,1584764,8.35%,239.656
3,4,Gujarat,-,1502899,7.92%,227.276
4,5,Karnataka,1631977,1493127,7.87%,225.798
5,6,West Bengal,1253832,1089898,5.75%,164.820
6,7,Rajasthan,1020989,942586,4.97%,142.543
7,8,Andhra Pradesh,972782,862957,4.55%,130.501
8,9,Telangana,969604,861031,4.54%,130.210
9,10,Madhya Pradesh,906672,809592,4.27%,122.431


### Q5. Scrape the details of trending repositories on Github.com.
#### Url = https://github.com/
**You have to find the following details:**
    
**A) Repository title**

**B) Repository description**

**C) Contributors count**

**D) Language used**

***Note: - From the home page you have to click on the trending option from Explore menu through code.***

In [6]:
#Scrape the trending repositories on GitHub: title, description and language
#from the trending list, plus the contributor count read from each repository
#page opened in a separate tab.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')

#one list per output column
repository_titles = []
repository_descriptions = []
contributor_counts = []
languages_used = []

try:
    driver.get('https://github.com')

    #clicking on explore (hover first, then click to open the menu)
    explore = WebDriverWait(driver, 5).until(
        ec.element_to_be_clickable((By.XPATH, '//summary[contains(text(),"Explore")]'))
    )
    ActionChains(driver).move_to_element(explore).perform()
    explore.click()

    #clicking on trending
    WebDriverWait(driver, 5).until(
        ec.element_to_be_clickable((By.XPATH, '//li/a[contains(text(),"Trending")]'))
    ).click()

    #initializing trending repository containers
    repository_containers = driver.find_elements(By.XPATH, '//article[@class="Box-row"]')

    #storing original window handle so we can return to the trending list
    original_window = driver.current_window_handle

    for repo in repository_containers:
        #scraping repository title; reset per repository so a failed lookup
        #never reuses the link of the previous repository
        title = None
        try:
            title = repo.find_element(By.XPATH, './/h1/a')
            repository_titles.append(title.text)
        except NoSuchElementException:
            repository_titles.append('-')

        #scraping repository description
        try:
            description = repo.find_element(By.XPATH, './/p')
            repository_descriptions.append(description.text)
        except NoSuchElementException:
            repository_descriptions.append('-')

        #scraping languages used
        try:
            language = repo.find_element(By.XPATH, './/span[@itemprop="programmingLanguage"]')
            languages_used.append(language.text)
        except NoSuchElementException:
            languages_used.append('-')

        #scraping contributors count; without a title link there is no page to open
        if title is None:
            contributor_counts.append('-')
            continue

        try:
            #CTRL+ENTER on the link opens the repository in a new tab
            title.send_keys(Keys.CONTROL + Keys.ENTER)
            driver.switch_to.window(driver.window_handles[-1])
            contributor = WebDriverWait(driver, 10).until(
                ec.element_to_be_clickable(
                    (By.XPATH, '//a[contains(text(),"Contributors")]/span[@class="Counter "]')
                )
            )
            contributor_counts.append(contributor.text)
        except TimeoutException:
            contributor_counts.append('-')
        finally:
            #close the repository tab only if one was really opened, then
            #return to the trending list
            if driver.current_window_handle != original_window:
                driver.close()
            driver.switch_to.window(original_window)
finally:
    #always shut the browser down (all tabs), even when a step fails
    driver.quit()

#storing scraped data into dataframe
df_trending_repositories = pd.DataFrame({
    "Repository Title": repository_titles,
    "Repository Description": repository_descriptions,
    "Contributor Counts": contributor_counts,
    "Language Used": languages_used,
})

#interpreting scraped data
df_trending_repositories


Unnamed: 0,Repository Title,Repository Description,Contributor Counts,Language Used
0,supabase / supabase,The open source Firebase alternative. Follow t...,91,TypeScript
1,livewire / livewire,A full-stack framework for Laravel that takes ...,160,Blade
2,johnBuffer / NoCol,Trajectories finder,3,C++
3,521xueweihan / HelloGitHub,分享 GitHub 上有趣、入门级的开源项目,11,Python
4,Developer-Y / cs-video-courses,List of Computer Science courses with video le...,44,-
5,PaddlePaddle / PaddleDetection,Object detection and instance segmentation too...,55,Python
6,nextapps-de / winbox,WinBox is a professional HTML5 window manager ...,-,JavaScript
7,UnityTechnologies / open-project-1,Unity Open Project #1: Chop Chop,83,C#
8,flashlight / flashlight,A C++ standalone library for machine learning,45,C++
9,TheAlgorithms / Python,All Algorithms implemented in Python,683,Python


### Q6. Scrape the details of top 100 songs on billboard.com.
#### Url = https://www.billboard.com/
**You have to find the following details:**

**A) Song name**

**B) Artist name**

**C) Last week rank**

**D) Peak rank**

**E) Weeks on board**

***Note: - From the home page you have to click on the Charts option and then on the Hot 100 page link through code.***

In [7]:
#Scrape the Billboard Hot 100 chart (song, artist, last week rank, peak rank and
#weeks on board), reaching the chart from the home page through the Charts menu.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')


def _song_field(song, xpath):
    """Return the text of the element matching `xpath` inside `song`, or '-' if absent."""
    try:
        return song.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return '-'


#one list per output column
song_names = []
artist_names = []
last_week_ranks = []
peak_ranks = []
weeks_on_boards = []

try:
    driver.get('https://www.billboard.com')

    #initializing the CHARTS menu: several links match the xpath, keep the
    #first one whose rendered text is "CHARTS"
    containers = driver.find_elements(By.XPATH, "//li/a[normalize-space(text()) = 'Charts']")
    chart = next((link for link in containers if link.text == "CHARTS"), None)
    if chart is None:
        raise NoSuchElementException('CHARTS menu entry not found on the billboard.com home page')

    #hovering over chart menu
    ActionChains(driver).move_to_element(chart).perform()

    #initializing and clicking on hot 100 link (the second match is the one clicked)
    hot_100 = driver.find_elements(By.XPATH, '//a[@data-track-action="main-nav" and text()="Hot 100"]')
    hot_100[1].click()

    #initializing top 100 songs containers
    song_containers = WebDriverWait(driver, 10).until(
        ec.presence_of_all_elements_located((By.XPATH, '//ol[@class="chart-list__elements"]/li'))
    )

    #scraping song details; every missing field becomes '-' so the lists stay aligned
    for song in song_containers:
        song_names.append(_song_field(
            song, './/span[@class="chart-element__information__song text--truncate color--primary"]'))
        artist_names.append(_song_field(
            song, './/span[@class="chart-element__information__artist text--truncate color--secondary"]'))
        last_week_ranks.append(_song_field(
            song, './/div[@class="chart-element__meta text--center color--secondary text--last"]'))
        peak_ranks.append(_song_field(
            song, './/div[@class="chart-element__meta text--center color--secondary text--peak"]'))
        weeks_on_boards.append(_song_field(
            song, './/div[@class="chart-element__meta text--center color--secondary text--week"]'))
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_top_100_songs = pd.DataFrame({
    "Song Name": song_names,
    "Artist Name": artist_names,
    "Last Week Rank": last_week_ranks,
    "Peak Rank": peak_ranks,
    "Weeks on Board": weeks_on_boards,
})

#interpreting scraped data
df_top_100_songs


Unnamed: 0,Song Name,Artist Name,Last Week Rank,Peak Rank,Weeks on Board
0,Rapstar,Polo G,1,1,2
1,Leave The Door Open,Silk Sonic (Bruno Mars & Anderson .Paak),3,1,7
2,Peaches,Justin Bieber Featuring Daniel Caesar & Giveon,4,1,5
3,Montero (Call Me By Your Name),Lil Nas X,2,1,4
4,Levitating,Dua Lipa Featuring DaBaby,6,5,29
...,...,...,...,...,...
104,Go!,Moneybagg Yo Featuring BIG30,-,96,1
105,Drankin N Smokin,Future & Lil Uzi Vert,-,31,12
106,Monsters,All Time Low Featuring Demi Lovato & blackbear,95,55,17
107,Slatty,Young Thug & Gunna Featuring Yak Gotti & Lil Duke,-,99,1


### Q7. Scrape the details of Data science recruiters from naukri.com.
#### Url = https://www.naukri.com/
**You have to find the following details:**
    
**A) Name**

**B) Designation**

**C) Company**

**D) Skills they hire for**

**E) Location**

***Note: - From the naukri.com homepage click on the recruiters option, then type Data science in the search pane and click on search. All this should be done through code.***

In [8]:
#Scrape Data science recruiters (name, designation, company, skills they hire
#for and location) from naukri.com, going through the recruiter search page.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')


def _recruiter_field(card, xpath):
    """Return the text of the element matching `xpath` inside `card`, or '-' if absent."""
    try:
        return card.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return '-'


#one list per output column
names = []
designations = []
companies = []
skills_they_hires = []
locations = []

try:
    driver.get('https://www.naukri.com')

    #clicking on recruiters option
    WebDriverWait(driver, 10).until(
        ec.element_to_be_clickable((By.XPATH, '//li/a[@title="Search Recruiters"]'))
    ).click()

    #the recruiter search opens in another window; continue in the newest one
    driver.switch_to.window(driver.window_handles[-1])

    #initializing search box with Data science and clicking on search button
    driver.find_element(By.XPATH, '//input[@name="qp"]').send_keys('Data science')
    driver.find_element(By.XPATH, '//button[@id="qsbFormBtn"]').click()

    #initializing recruiter containers
    recruiter_containers = WebDriverWait(driver, 10).until(
        ec.presence_of_all_elements_located(
            (By.XPATH, '//div[@id="tabP-1"]/div[@class="outerRecSec"]')
        )
    )

    #each outer container holds one or more "recSec" recruiter cards
    for recruiter in recruiter_containers:
        for col in recruiter.find_elements(By.CLASS_NAME, 'recSec'):
            names.append(_recruiter_field(col, './/span[@class="fl ellipsis"]'))
            designations.append(_recruiter_field(col, './/span[@class="ellipsis clr"]'))
            companies.append(_recruiter_field(col, './/a[@class="ellipsis"][2]'))
            skills_they_hires.append(_recruiter_field(col, './/div[@class="hireSec highlightable"]'))
            locations.append(_recruiter_field(col, './/small[@class="ellipsis"]'))
finally:
    #quit() closes every window of the session, including the original home
    #page window, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_recruiters = pd.DataFrame({
    "Name": names,
    "Designation": designations,
    "Company": companies,
    "Skills They Hire For": skills_they_hires,
    "Location": locations,
})

#interpreting scraped data
df_recruiters


Unnamed: 0,Name,Designation,Company,Skills They Hire For,Location
0,Aakash Harit,HR Manager,Data Science Network,"Classic ASP Developer, Internet Marketing Prof...",Delhi
1,shravan Kumar Gaddam,Company Recruiter,Shore Infotech India Pvt. Ltd,".Net, Java, Data Science, Linux Administration...",Hyderabad / Secunderabad
2,Talent Acquisition Executive,Recruitment Professional,XenonStack,"Web Designing, html5, Angular.js, seo, hadoop,...",Chandigarh
3,Anik Agrawal,Company Recruiter,Enerlytics Software Solutions Pvt Ltd,"Mean Stack, javascript, angularjs, mongodb, We...",Ahmedabad
4,MARSIAN Technologies LLP,Company HR,MARSIAN Technologies LLP,"Data Science, Artificial Intelligence, Machine...",Pune
5,subhas patel,Founder CEO,LibraryXProject,"Hadoop, Spark, Digital Strategy, Data Architec...",UK - (london)
6,Abhishek - Only Analytics Hiring - India and,Recruitment Lead Consultant,Apidel Technologies Division of Transpower,"Analytics, Business Intelligence, Business Ana...",Vadodara / Baroda
7,Institute for Financial Management and Resear,Programme Manager,IFMR,Data Science,Chennai
8,Balu Ramesh,HR Administrator,Techvantage Systems Pvt Ltd,"Machine Learning, algorithms, Go Getter, Compu...",Trivandrum
9,Asif Lucknowi,Director,Weupskill- Live Wire India,"Technical Training, Software Development, Pres...",Indore


### Q8. Scrape the details of Highest selling novels.
#### Url = https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare/

**You have to find the following details:**
    
**A) Book name**

**B) Author name**

**C) Volumes sold**

**D) Publisher**

**E) Genre**

In [9]:
#Scrape book name, author, volumes sold, publisher and genre of the highest
#selling novels from the Guardian datablog table.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')


def _novel_cell(cols, index):
    """Return the text of cell `index` in `cols`, or '-' when the row has no such cell."""
    #a short row shows up as a missing list entry (IndexError), not as a
    #NoSuchElementException, so the length is checked explicitly
    return cols[index].text if index < len(cols) else '-'


#one list per output column
book_names = []
author_names = []
volume_solds = []
publishers = []
genres = []

try:
    driver.get('https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare')

    #initializing novel containers
    novel_containers = WebDriverWait(driver, 10).until(
        ec.presence_of_all_elements_located(
            (By.XPATH, '//table[@class="in-article sortable"]/tbody/tr')
        )
    )

    for novel in novel_containers:
        cols = novel.find_elements(By.XPATH, './/td')

        #rows without any data cell carry nothing to record
        if not cols:
            continue

        #cell 0 is not used; the requested fields start at cell 1
        book_names.append(_novel_cell(cols, 1))
        author_names.append(_novel_cell(cols, 2))
        volume_solds.append(_novel_cell(cols, 3))
        publishers.append(_novel_cell(cols, 4))
        genres.append(_novel_cell(cols, 5))
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_highest_selling_novels = pd.DataFrame({
    "Book Name": book_names,
    "Author Name": author_names,
    "Volumes Sold": volume_solds,
    "Publisher": publishers,
    "Genre": genres,
})

#interpreting scraped data
df_highest_selling_novels


Unnamed: 0,Book Name,Author Name,Volumes Sold,Publisher,Genre
0,"Da Vinci Code,The","Brown, Dan",5094805,Transworld,"Crime, Thriller & Adventure"
1,Harry Potter and the Deathly Hallows,"Rowling, J.K.",4475152,Bloomsbury,Children's Fiction
2,Harry Potter and the Philosopher's Stone,"Rowling, J.K.",4200654,Bloomsbury,Children's Fiction
3,Harry Potter and the Order of the Phoenix,"Rowling, J.K.",4179479,Bloomsbury,Children's Fiction
4,Fifty Shades of Grey,"James, E. L.",3758936,Random House,Romance & Sagas
...,...,...,...,...,...
95,"Ghost,The","Harris, Robert",807311,Random House,General & Literary Fiction
96,Happy Days with the Naked Chef,"Oliver, Jamie",794201,Penguin,Food & Drink: General
97,"Hunger Games,The:Hunger Games Trilogy","Collins, Suzanne",792187,Scholastic Ltd.,Young Adult Fiction
98,"Lost Boy,The:A Foster Child's Search for the L...","Pelzer, Dave",791507,Orion,Biography: General


### Q9. Scrape the details of the most watched TV series of all time from imdb.com.
#### Url = https://www.imdb.com/list/ls095964455/
**You have to find the following details:**
    
**A) Name**

**B) Year span**

**C) Genre**

**D) Run time**

**E) Ratings**

**F) Votes**

In [10]:
#Scrape name, year span, genre, run time, rating and votes of the most watched
#TV series from the IMDb list page.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')


def _series_field(series, xpath):
    """Return the text of the element matching `xpath` inside `series`, or '-' if absent."""
    try:
        return series.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return '-'


#one list per output column
names = []
year_spans = []
genres = []
run_times = []
ratings = []
votes = []

try:
    driver.get('https://www.imdb.com/list/ls095964455/')

    #initializing most watched tv series containers
    series_containers = WebDriverWait(driver, 10).until(
        ec.presence_of_all_elements_located((By.XPATH, '//div[@class="lister-item-content"]'))
    )

    #every missing field becomes '-' so the lists stay aligned
    for series in series_containers:
        names.append(_series_field(series, './/h3[@class="lister-item-header"]/a'))
        year_spans.append(_series_field(series, './/span[@class="lister-item-year text-muted unbold"]'))
        genres.append(_series_field(series, './/span[@class="genre"]'))
        run_times.append(_series_field(series, './/span[@class="runtime"]'))
        ratings.append(_series_field(series, './/span[@class="ipl-rating-star__rating"]'))
        votes.append(_series_field(series, './/span[@name="nv"]'))
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data into dataframe
df_most_watched_tv_series = pd.DataFrame({
    "Name": names,
    "Year Span": year_spans,
    "Genre": genres,
    "Run Time": run_times,
    "Ratings": ratings,
    "Votes": votes,
})

#interpreting scraped data
df_most_watched_tv_series


Unnamed: 0,Name,Year Span,Genre,Run Time,Ratings,Votes
0,Game of Thrones,(2011–2019),"Action, Adventure, Drama",57 min,9.3,1801079
1,Stranger Things,(2016– ),"Drama, Fantasy, Horror",51 min,8.7,846490
2,The Walking Dead,(2010–2022),"Drama, Horror, Thriller",44 min,8.2,867286
3,13 Reasons Why,(2017–2020),"Drama, Mystery, Thriller",60 min,7.6,260013
4,The 100,(2014–2020),"Drama, Mystery, Sci-Fi",43 min,7.6,220969
...,...,...,...,...,...,...
95,Reign,(2013–2017),"Drama, Fantasy",42 min,7.5,44188
96,A Series of Unfortunate Events,(2017–2019),"Adventure, Comedy, Drama",50 min,7.8,54639
97,Criminal Minds,(2005–2020),"Crime, Drama, Mystery",42 min,8.1,165285
98,Scream: The TV Series,(2015–2019),"Comedy, Crime, Drama",45 min,7.2,34551


### Q10. Details of Datasets from UCI machine learning repositories.
#### Url = https://archive.ics.uci.edu/
**You have to find the following details:**
    
**A) Dataset name**

**B) Data type**

**C) Task**

**D) Attribute type**

**E) No of instances**

**F) No of attribute**

**G) Year**

**Note: - from the home page you have to go to the Show All Dataset page through code**

In [11]:
#Scrape the dataset listing of the UCI machine learning repository (name, data
#type, task, attribute type, instances, attributes and year), reaching it from
#the home page through the "View ALL Data Sets" link.
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#form; newer Selenium releases expect a Service object - confirm installed version.
driver = webdriver.Chrome('chromedriver.exe')


def _dataset_cell(cols, index):
    """Return the stripped text of cell `index` in `cols`, or '-' when the row has no such cell."""
    #a short row shows up as a missing list entry (IndexError), not as a
    #NoSuchElementException, so the length is checked explicitly
    return cols[index].text.strip() if index < len(cols) else '-'


#one list per output column
names = []
data_types = []
tasks = []
attribute_types = []
instances = []
attributes = []
years = []

try:
    driver.get('https://archive.ics.uci.edu')

    #click on link view all data sets
    WebDriverWait(driver, 10).until(
        ec.element_to_be_clickable((By.XPATH, '//b[contains(text(),"View ALL Data Sets")]'))
    ).click()

    #initializing dataset containers
    dataset_containers = WebDriverWait(driver, 10).until(
        ec.presence_of_all_elements_located(
            (By.XPATH, '//body/table[2]/tbody/tr/td[2]/table[2]/tbody/tr')
        )
    )

    #the first row is skipped (the original code treats it as a header)
    for dataset in dataset_containers[1:]:
        #only direct child cells, so cells of nested tables are not counted
        cols = dataset.find_elements(By.XPATH, './td')

        #rows without any data cell carry nothing to record
        if not cols:
            continue

        names.append(_dataset_cell(cols, 0))
        data_types.append(_dataset_cell(cols, 1))
        tasks.append(_dataset_cell(cols, 2))
        attribute_types.append(_dataset_cell(cols, 3))
        instances.append(_dataset_cell(cols, 4))
        attributes.append(_dataset_cell(cols, 5))
        years.append(_dataset_cell(cols, 6))
finally:
    #always shut the browser down, even when a wait or lookup fails
    driver.quit()

#storing scraped data in dataframe
df_datasets = pd.DataFrame({
    "Dataset Name": names,
    "Data Types": data_types,
    "Task": tasks,
    "Atrribute Types": attribute_types,
    "No. of Instances": instances,
    "No. of Attributes": attributes,
    "Year": years,
})

#interpreting scraped data
df_datasets


Unnamed: 0,Dataset Name,Data Types,Task,Atrribute Types,No. of Instances,No. of Attributes,Year
0,Abalone,Multivariate,Classification,"Categorical, Integer, Real",4177,8,1995
1,Adult,Multivariate,Classification,"Categorical, Integer",48842,14,1996
2,Annealing,Multivariate,Classification,"Categorical, Integer, Real",798,38,
3,Anonymous Microsoft Web Data,,Recommender-Systems,Categorical,37711,294,1998
4,Arrhythmia,Multivariate,Classification,"Categorical, Integer, Real",452,279,1998
...,...,...,...,...,...,...,...
580,Wisesight Sentiment Corpus,"Multivariate, Text",Classification,,26737,4,2020
581,AI4I 2020 Predictive Maintenance Dataset,"Multivariate, Time-Series","Classification, Regression, Causal-Discovery",Real,10000,14,2020
582,Dry Bean Dataset,Multivariate,Classification,"Integer, Real",13611,17,2020
583,in-vehicle coupon recommendation,Multivariate,Classification,,12684,23,2020


End of Web Scraping Assignment 4|
----------------------------------------------------|