Skip to content

Commit

Permalink
created test file, separated requests and selenium code, updated readme
Browse files Browse the repository at this point in the history
  • Loading branch information
YusufBritton1990 committed Jul 30, 2019
1 parent 191055b commit 3083811
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ scripts
scripts.tar.gz
env
example.py
test.py
debug.log
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@

This project is to test data scraping on IMSdB and Springfield Springfield
websites.

Requests: Given a URL, connects to a website and fetches its HTML quickly.
Selenium: Given a URL, opens the website in a real browser and can interact with it.
BeautifulSoup: Parses the HTML obtained from Requests or Selenium.
Pandas: Puts the information extracted by BeautifulSoup into tabular and Excel form.

## Information related to project
Notes below for "download_all_scripts" from j2kun
https://github.com/j2kun/imsdb_download_all_scripts

Selenium: Built against ChromeDriver 75.0.3770.140
https://sites.google.com/a/chromium.org/chromedriver/downloads
https://sites.google.com/a/chromium.org/chromedriver/getting-started

## Sources for data scraping

Web scraping, using Requests

Web scraping, using Selenium and ChromeDriver
https://www.youtube.com/watch?time_continue=1&v=5cPOONZzflM

Further reading on Selenium
https://automatetheboringstuff.com/chapter11/#calibre_link-2984
26 changes: 26 additions & 0 deletions springfield-requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from bs4 import BeautifulSoup #webparser
import requests #HTTP request, to get HTML
from selenium import webdriver #Needed for dynamic scraping
import os #Needed to connect to ChromeDriver

"""Requesting website"""
# Page that lists the movie scripts, one link per movie.
MOVIE_SCRIPTS_URL = "https://www.springfieldspringfield.co.uk/movie_scripts.php"


def split_title_year(full_name):
    """Split link text of the form ``"Title (Year)"`` into ``(title, year)``.

    The year is taken from the last parenthesised group, so a title that
    itself contains parentheses is kept intact. Surrounding whitespace is
    removed from both parts.

    Args:
        full_name: Text of one movie link.

    Returns:
        A ``(title, year)`` tuple of strings. ``year`` is ``""`` when the
        text contains no ``"("``.
    """
    text = full_name.strip()
    title, sep, tail = text.rpartition("(")
    if not sep:
        # No parenthesised part at all: treat the whole text as the title
        # instead of failing with an IndexError.
        return text, ""
    return title.strip(), tail.rstrip(")").strip()


def main():
    """Fetch the movie list page with Requests and print every movie title."""
    # A timeout keeps the script from hanging forever on a stalled server.
    res = requests.get(MOVIE_SCRIPTS_URL, timeout=30)
    # Fail loudly on HTTP errors rather than parsing an error page.
    res.raise_for_status()

    # Parse the downloaded HTML with the lxml parser.
    soup = BeautifulSoup(res.text, "lxml")

    # Container element that holds the movie links.
    outer_box = soup.find('div', {'class': "main-content-left"})
    if outer_box is None:
        raise SystemExit(
            "Could not find the 'main-content-left' container on the page"
        )

    # Each matching link's text is one movie title followed by its year.
    for movie in outer_box.find_all('a', {'class': "script-list-item"}):
        name, _year = split_title_year(movie.text)
        print(name)


if __name__ == "__main__":
    main()
14 changes: 4 additions & 10 deletions springfield.py → springfield-selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,16 @@
from selenium import webdriver #Needed for dynamic scraping
import os #Needed to connect to ChromeDriver

# Launch Chrome through the ChromeDriver binary whose path is given by the
# CHROME_DRIVER environment variable.
driver = webdriver.Chrome(os.environ['CHROME_DRIVER'])

try:
    driver.get("https://www.springfieldspringfield.co.uk/movie_scripts.php")

    # Ask the browser for the HTML of the document as it currently stands.
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Always close the browser window, even if loading the page failed.
    driver.quit()

# Parse the HTML returned by Selenium with the lxml parser.
soup = BeautifulSoup(res, "lxml")

Expand All @@ -27,8 +23,6 @@

# Collect every <a class="script-list-item"> link inside outer_box; the text
# of each link holds one movie title and its year.
movie_titles_list = outer_box.find_all('a', {'class': "script-list-item"})
# print(movie_titles_list)
# print(type(movie_titles_list))

for movie in movie_titles_list:
full_name = movie.text #The actually movie title and year
Expand Down
24 changes: 24 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import time
from selenium import webdriver
import os

def main():
    """Open inventwithpython.com in Chrome and follow its 'Read Online for Free' link."""
    # The ChromeDriver path is read from the CHROME_DRIVER environment variable.
    driver = webdriver.Chrome(os.environ['CHROME_DRIVER'])
    try:
        # Google test (disabled; kept for reference)
        # driver.get('http://www.google.com/xhtml');
        # time.sleep(5) # Let the user actually see something!
        # search_box = driver.find_element_by_name('q')
        # search_box.send_keys('ChromeDriver')
        # search_box.submit()
        # time.sleep(5) # Let the user actually see something!

        # Automate test
        driver.get('http://inventwithpython.com')  # opens the page in the browser
        time.sleep(5)  # Let the user actually see something!
        link_elem = driver.find_element_by_link_text('Read Online for Free')  # selects the link
        print(type(link_elem))
        link_elem.click()  # follows the "Read Online for Free" link
        time.sleep(5)  # Let the user actually see something!
    finally:
        # Close the browser even when the lookup or the click raises.
        driver.quit()


if __name__ == "__main__":
    main()

0 comments on commit 3083811

Please sign in to comment.