Created test file, separated requests and selenium code, updated readme

YusufBritton1990 · Jul 30, 2019 · 3083811 · 3083811
1 parent 191055b
commit 3083811
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,4 @@ scripts
 scripts.tar.gz
 env
 example.py
-test.py
+debug.log
diff --git a/README.md b/README.md
@@ -1,7 +1,26 @@
 
+This project tests data scraping on the IMSDb and Springfield Springfield
+websites.
+
+- Requests: Given a URL, makes a connection to a website and gathers data quickly.
+- Selenium: Given a URL, makes a connection to a website and can interact with it.
+- BeautifulSoup: Parses the HTML obtained from Requests or Selenium.
+- Pandas: Puts the information from BeautifulSoup into tabular and Excel forms.
+
+## Information related to project
 Notes below for "download_all_scripts" from j2kun
 https://github.com/j2kun/imsdb_download_all_scripts
 
 Selenium: Build based on ChromeDriver 75.0.3770.140
 https://sites.google.com/a/chromium.org/chromedriver/downloads
 https://sites.google.com/a/chromium.org/chromedriver/getting-started
+
+## Sources for data scraping
+
+Web scraping, using Requests
+
+Web scraping, using Selenium and ChromeDriver
+https://www.youtube.com/watch?time_continue=1&v=5cPOONZzflM
+
+Further understanding of Selenium
+https://automatetheboringstuff.com/chapter11/#calibre_link-2984
diff --git a/springfield-requests.py b/springfield-requests.py
@@ -0,0 +1,26 @@
+from bs4 import BeautifulSoup #webparser
+import requests #HTTP request, to get HTML
+from selenium import webdriver #Needed for dynamic scraping
+import os #Needed to connect to ChromeDriver
+
+"""Requesting website"""
+# Using requests
+res = requests.get("https://www.springfieldspringfield.co.uk/movie_scripts.php")
+
+"""Using BeautifulSoup to parse HTML as a soup object"""
+# Parse the HTML text of the Requests response (res) with the lxml parser
+soup = BeautifulSoup(res.text, "lxml") #Using requests, Type is a soup
+
+# This is the single <div class="main-content-left"> element (find returns one
+# element, not a list); it wraps the movie titles and years
+outer_box = soup.find('div', {'class': "main-content-left"})
+
+# This is a list of the movie titles and years
+movie_titles_list = outer_box.find_all('a', {'class': "script-list-item"})
+
+for movie in movie_titles_list:
+    full_name = movie.text #The actual movie title and year
+
+    # using "(" as a delimiter, then taking out ")" in the year
+    name, year = full_name.split("(")[0], full_name.split("(")[1][:-1]
+    print(name)
diff --git a/springfield.py → springfield-selenium.py b/springfield.py → springfield-selenium.py
@@ -3,20 +3,16 @@
 from selenium import webdriver #Needed for dynamic scraping
 import os #Needed to connect to ChromeDriver
 
-# This will run chrome
+"""Using Selenium to run chrome"""
 driver = webdriver.Chrome(os.environ['CHROME_DRIVER'])
-
 driver.get("https://www.springfieldspringfield.co.uk/movie_scripts.php")
 
-# requesting website
-# Using requests
-# res = requests.get("https://www.springfieldspringfield.co.uk/movie_scripts.php")
-
+"""Requesting website"""
 # Using Selenium, which will return HTML
 res = driver.execute_script("return document.documentElement.outerHTML")
 
-# using soup to parse information from website as a XML
-# soup = BeautifulSoup(res.text, "lxml") #Using requests, Type is a soup
+"""Using BeautifulSoup to parse HTML as a soup object"""
+# Using Selenium res to parse information
 soup = BeautifulSoup(res, "lxml") #Using selenium, Type is a soup
 driver.quit() #this will close window
 
@@ -27,8 +23,6 @@
 
 # This is a list of the movie titles and years
 movie_titles_list = outer_box.find_all('a', {'class': "script-list-item"})
-# print(movie_titles_list)
-# print(type(movie_titles_list))
 
 for movie in movie_titles_list:
     full_name = movie.text #The actually movie title and year

diff --git a/test.py b/test.py
@@ -0,0 +1,24 @@
+import time
+from selenium import webdriver
+import os
+
+driver = webdriver.Chrome(os.environ['CHROME_DRIVER'])  # Optional argument, if not specified will search path.
+
+"""Google test"""
+# driver.get('http://www.google.com/xhtml');
+# time.sleep(5) # Let the user actually see something!
+# search_box = driver.find_element_by_name('q')
+# search_box.send_keys('ChromeDriver')
+# search_box.submit()
+# time.sleep(5) # Let the user actually see something!
+
+
+"""Automate test"""
+driver.get('http://inventwithpython.com') #opens the browser
+time.sleep(5) # Let the user actually see something!
+linkElem = driver.find_element_by_link_text('Read Online for Free') #selects the link
+print(type(linkElem))
+linkElem.click() # follows the "Read Online for Free" link
+time.sleep(5) # Let the user actually see something!
+
+driver.quit()