Add requests + BeautifulSoup scraping of the Springfield movie-scripts website

YusufBritton1990 · Jul 24, 2019 · 083d3f2 · 083d3f2
1 parent d42c0d7
commit 083d3f2
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 34 deletions.
diff --git a/god_count.py b/god_count.py
@@ -2,21 +2,42 @@
 import re #Regex
 import pandas as pd
 import numpy as np
+import glob #Used to select all txt filenames
 
 """Pathing"""
+
+# Test 1: Adding multiple scripts into script_list
+# Successful
+# script_list = []
+# script_names = ['12-Monkeys','Carrie','Catch-Me-If-You-Can' ,'Cell,-The']
+#
+# script = os.path.join(os.getcwd(), 'scripts\\12-Monkeys.txt')
+# script2 = os.path.join(os.getcwd(), 'scripts\\Carrie.txt')
+# script3 = os.path.join(os.getcwd(), 'scripts\\Catch-Me-If-You-Can.txt')
+# script4 = os.path.join(os.getcwd(), 'scripts\\Cell,-The.txt')
+#
+#
+# script_list.extend([script,script2,script3, script4])
+
+# TODO: Need to pull all scripts (currently 700's). Use glob to accomplish
+
+test = glob.glob("scripts/*") # every file in scripts/; on Windows each path looks like "scripts\\<movie>.txt"
+
 script_list = []
-script_names = ['12-Monkeys','Carrie','Catch-Me-If-You-Can' ,'Cell,-The' ,'Kafka']
+script_names = [script.split("\\")[1][:-4] for script in test]  # movie name without ".txt"; NOTE: splitting on "\\" assumes Windows path separators
+
+
+
+# for script in test:
+#     print(script.split("\\")[1][:-4]) #return movie, without txt
+#     break
+
+print(script_names)
+
+
 
-script = os.path.join(os.getcwd(), 'scripts\\12-Monkeys.txt')
-script2 = os.path.join(os.getcwd(), 'scripts\\Carrie.txt')
-script3 = os.path.join(os.getcwd(), 'scripts\\Catch-Me-If-You-Can.txt')
-script4 = os.path.join(os.getcwd(), 'scripts\\Cell,-The.txt')
-script5 = os.path.join(os.getcwd(), 'scripts\\Kafka.txt')
 
-# Adding multiple scripts into script_list
-script_list.extend([script,script2,script3, script4,script5])
 
-# TODO: Need to pull all scripts (currently 700's). Use glob to accomplish
 # TODO: Pull from springfield source
 # https://www.springfieldspringfield.co.uk/movie_scripts.php?order=0
 
@@ -30,30 +51,32 @@
 christ_series = []
 
 """Counting words"""
-for script in script_list:
-    with open(script, 'r', encoding="utf-8") as file:
-        data = file.read()
-
-        # /b is an empty string
-        # Using re.finditer, counting all the matches in the scripts
-        god_tick = sum(1 for match in re.finditer(r"\bGod\b", data))
-        jesus_tick = sum(1 for match in re.finditer(r"\bJesus\b", data))
-        christ_tick = sum(1 for match in re.finditer(r"\bChrist\b", data))
-
-        print(f"God: {god_tick}, Jesus: {jesus_tick}, Christ: {christ_tick} in this script")
-
-        god_series.append(god_tick)
-        jesus_series.append(jesus_tick)
-        christ_series.append(christ_tick)
-
-        god_count += god_tick
-        jesus_count += jesus_tick
-        christ_count += christ_tick
+# for script in script_list:
+#     with open(script, 'r', encoding="utf-8") as file:
+#         data = file.read()
+#
+#         # \b matches the empty string at a word boundary (start or end of a word)
+#         # Using re.finditer, counting all the matches in the scripts
+#         god_tick = sum(1 for match in re.finditer(r"\bGod\b", data))
+#         jesus_tick = sum(1 for match in re.finditer(r"\bJesus\b", data))
+#         christ_tick = sum(1 for match in re.finditer(r"\bChrist\b", data))
+#
+#         # print(f"God: {god_tick}, Jesus: {jesus_tick}, Christ: {christ_tick} in this script")
+#
+#         god_series.append(god_tick)
+#         jesus_series.append(jesus_tick)
+#         christ_series.append(christ_tick)
+#
+#         god_count += god_tick
+#         jesus_count += jesus_tick
+#         christ_count += christ_tick
 
-        print(f"God shows up {god_count} times, Jesus {jesus_count}, and Christ {christ_count}")
-print(f"God list: {god_series} times, Jesus list: {jesus_series}, and Christ list: {christ_series}")
+#         print(f"God shows up {god_count} times, Jesus {jesus_count}, and Christ {christ_count}")
+# print(f"God list: {god_series} times, Jesus list: {jesus_series}, and Christ list: {christ_series}")
 
 """Dataframe"""
-data = {'scripts': script_names, 'god_count': god_series,'jesus_count': jesus_series,'christ_count': christ_series}
-df = pd.DataFrame(data)
-df.to_csv(os.path.join(os.getcwd(), 'movies.csv'), index=False)
+# data = {'scripts': script_names, 'god_count': god_series,'jesus_count': jesus_series,'christ_count': christ_series}
+# print(data)
+#
+# df = pd.DataFrame(data)
+# df.to_csv(os.path.join(os.getcwd(), 'movies.csv'), index=False)
diff --git a/movies.csv b/movies.csv
@@ -3,4 +3,3 @@ scripts,god_count,jesus_count,christ_count
 Carrie,14,12,2
 Catch-Me-If-You-Can,4,3,0
 "Cell,-The",5,3,2
-Kafka,2,0,0
diff --git a/springfield.py b/springfield.py
@@ -0,0 +1,13 @@
+from bs4 import BeautifulSoup
+import requests
+
+# requesting website
+res = requests.get("https://www.springfieldspringfield.co.uk/movie_scripts.php")
+
+# parse the response HTML with BeautifulSoup, using the lxml HTML parser
+soup = BeautifulSoup(res.text, "lxml")
+
+# find() returns the first <div class="main-content-left"> element (or None if absent), not a list
+# This div contains the movies
+outer_box = soup.find('div', {'class': "main-content-left"})
+print(outer_box)