-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Created test file, separated Requests and Selenium code, updated README
- Loading branch information
1 parent
191055b
commit 3083811
Showing
5 changed files
with
74 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,4 @@ scripts | |
scripts.tar.gz | ||
env | ||
example.py | ||
test.py | ||
debug.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,26 @@ | ||
|
||
This project is to test data scraping on IMSdB and Springfield Springfield | ||
websites. | ||
|
||
Requests: Using a URL, makes a connection to a website. Gathers data quickly. | ||
Selenium: Using a URL, makes a connection to a website and can interact with it | ||
BeautifulSoup: Parses information obtained from Requests or Selenium | ||
Pandas: Puts information from BeautifulSoup into tabular and excel forms | ||
|
||
## Information related to project | ||
Notes below for "download_all_scripts" from j2kun | ||
https://github.com/j2kun/imsdb_download_all_scripts | ||
|
||
Selenium: Build based on ChromeDriver 75.0.3770.140 | ||
https://sites.google.com/a/chromium.org/chromedriver/downloads | ||
https://sites.google.com/a/chromium.org/chromedriver/getting-started | ||
|
||
## Sources for data scraping | ||
|
||
Web scraping, using Requests | ||
|
||
Web scraping, using Selenium and ChromeDriver | ||
https://www.youtube.com/watch?time_continue=1&v=5cPOONZzflM | ||
|
||
Further understanding of Selenium | ||
https://automatetheboringstuff.com/chapter11/#calibre_link-2984 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from bs4 import BeautifulSoup #webparser | ||
import requests #HTTP request, to get HTML | ||
from selenium import webdriver #Needed for dynamic scraping | ||
import os #Needed to connect to ChromeDriver | ||
|
||
# Index page whose movie links are scraped by main().
MOVIE_SCRIPTS_URL = "https://www.springfieldspringfield.co.uk/movie_scripts.php"


def split_title_year(full_name):
    """Split link text of the form "Title (Year)" into ``(title, year)``.

    The text is split on the LAST "(" so titles that contain their own
    parentheses, e.g. "(500) Days of Summer (2009)", keep their full name.
    Surrounding whitespace is stripped from both parts and the closing ")"
    is removed from the year.

    Args:
        full_name: Link text as found on the page.

    Returns:
        A ``(title, year)`` tuple of strings. ``year`` is "" when the text
        contains no "(" at all; the trailing parenthesised part is returned
        as-is and is not validated as a number.
    """
    title, paren, tail = full_name.rpartition("(")
    if not paren:
        # No parenthesised suffix: the whole text is the title.
        return full_name.strip(), ""
    return title.strip(), tail.strip().rstrip(")").strip()


def main():
    """Fetch the movie-script index page and print every listed movie title.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the expected "main-content-left" container is
            missing from the page.
    """
    # Requests applies no timeout by default; without one a stalled server
    # would hang the script indefinitely.
    res = requests.get(MOVIE_SCRIPTS_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    res.raise_for_status()

    # Parse the returned HTML with the lxml parser.
    soup = BeautifulSoup(res.text, "lxml")

    # Container element that holds the movie links.
    outer_box = soup.find("div", {"class": "main-content-left"})
    if outer_box is None:
        raise RuntimeError(
            'no <div class="main-content-left"> found at ' + MOVIE_SCRIPTS_URL
        )

    # Each matching anchor's text is split into title and year.
    for movie in outer_box.find_all("a", {"class": "script-list-item"}):
        name, _year = split_title_year(movie.text)
        print(name)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import time | ||
from selenium import webdriver | ||
import os | ||
|
||
def main():
    """Drive Chrome to inventwithpython.com and follow its "Read Online for Free" link.

    The path to the chromedriver binary is read from the CHROME_DRIVER
    environment variable; a KeyError is raised if it is not set.
    """
    driver = webdriver.Chrome(os.environ['CHROME_DRIVER'])
    try:
        # Disabled alternative smoke test against Google, kept for reference:
        # driver.get('http://www.google.com/xhtml');
        # time.sleep(5) # Let the user actually see something!
        # search_box = driver.find_element_by_name('q')
        # search_box.send_keys('ChromeDriver')
        # search_box.submit()
        # time.sleep(5) # Let the user actually see something!

        # Automate test
        driver.get('http://inventwithpython.com')  # opens the page in the browser
        time.sleep(5)  # Let the user actually see something!

        # Locate the link by its visible text. The generic find_element()
        # form is used because the find_element_by_* helpers were removed
        # in newer Selenium releases.
        link_elem = driver.find_element('link text', 'Read Online for Free')
        print(type(link_elem))
        link_elem.click()  # follows the "Read Online for Free" link
        time.sleep(5)  # Let the user actually see something!
    finally:
        # Always shut the browser down, even if a step above raised, so no
        # Chrome/chromedriver process is left running.
        driver.quit()


if __name__ == "__main__":
    main()