Skip to content

Commit

Permalink
request and beautiful soup of springfield website
Browse files Browse the repository at this point in the history
  • Loading branch information
YusufBritton1990 committed Jul 24, 2019
1 parent d42c0d7 commit 083d3f2
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 34 deletions.
89 changes: 56 additions & 33 deletions god_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,42 @@
import re #Regex
import pandas as pd
import numpy as np
import glob #Used to select all txt filenames

"""Pathing"""

# Test 1: Adding multiple scripts into script_list
# Successful
# script_list = []
# script_names = ['12-Monkeys','Carrie','Catch-Me-If-You-Can' ,'Cell,-The']
#
# script = os.path.join(os.getcwd(), 'scripts\\12-Monkeys.txt')
# script2 = os.path.join(os.getcwd(), 'scripts\\Carrie.txt')
# script3 = os.path.join(os.getcwd(), 'scripts\\Catch-Me-If-You-Can.txt')
# script4 = os.path.join(os.getcwd(), 'scripts\\Cell,-The.txt')
#
#
# script_list.extend([script,script2,script3, script4])

# TODO: Need to pull all scripts (currently 700's). Use glob to accomplish

test = glob.glob("scripts/*") # returns "scripts//<movie>.txt"

script_list = []
script_names = ['12-Monkeys','Carrie','Catch-Me-If-You-Can' ,'Cell,-The' ,'Kafka']
script_names = [script.split("\\")[1][:-4] for script in test]#return movie, without txt



# for script in test:
# print(script.split("\\")[1][:-4]) #return movie, without txt
# break

print(script_names)



script = os.path.join(os.getcwd(), 'scripts\\12-Monkeys.txt')
script2 = os.path.join(os.getcwd(), 'scripts\\Carrie.txt')
script3 = os.path.join(os.getcwd(), 'scripts\\Catch-Me-If-You-Can.txt')
script4 = os.path.join(os.getcwd(), 'scripts\\Cell,-The.txt')
script5 = os.path.join(os.getcwd(), 'scripts\\Kafka.txt')

# Adding multiple scripts into script_list
script_list.extend([script,script2,script3, script4,script5])

# TODO: Need to pull all scripts (currently 700's). Use glob to accomplish
# TODO: Pull from springfield source
# https://www.springfieldspringfield.co.uk/movie_scripts.php?order=0

Expand All @@ -30,30 +51,32 @@
christ_series = []

"""Counting words"""
for script in script_list:
with open(script, 'r', encoding="utf-8") as file:
data = file.read()

# \b matches the empty string at a word boundary, so only the whole word is counted
# Using re.finditer, counting all the matches in the scripts
god_tick = sum(1 for match in re.finditer(r"\bGod\b", data))
jesus_tick = sum(1 for match in re.finditer(r"\bJesus\b", data))
christ_tick = sum(1 for match in re.finditer(r"\bChrist\b", data))

print(f"God: {god_tick}, Jesus: {jesus_tick}, Christ: {christ_tick} in this script")

god_series.append(god_tick)
jesus_series.append(jesus_tick)
christ_series.append(christ_tick)

god_count += god_tick
jesus_count += jesus_tick
christ_count += christ_tick
# for script in script_list:
# with open(script, 'r', encoding="utf-8") as file:
# data = file.read()
#
# # \b matches the empty string at a word boundary, so only the whole word is counted
# # Using re.finditer, counting all the matches in the scripts
# god_tick = sum(1 for match in re.finditer(r"\bGod\b", data))
# jesus_tick = sum(1 for match in re.finditer(r"\bJesus\b", data))
# christ_tick = sum(1 for match in re.finditer(r"\bChrist\b", data))
#
# # print(f"God: {god_tick}, Jesus: {jesus_tick}, Christ: {christ_tick} in this script")
#
# god_series.append(god_tick)
# jesus_series.append(jesus_tick)
# christ_series.append(christ_tick)
#
# god_count += god_tick
# jesus_count += jesus_tick
# christ_count += christ_tick

print(f"God shows up {god_count} times, Jesus {jesus_count}, and Christ {christ_count}")
print(f"God list: {god_series} times, Jesus list: {jesus_series}, and Christ list: {christ_series}")
# print(f"God shows up {god_count} times, Jesus {jesus_count}, and Christ {christ_count}")
# print(f"God list: {god_series} times, Jesus list: {jesus_series}, and Christ list: {christ_series}")

"""Dataframe"""
data = {'scripts': script_names, 'god_count': god_series,'jesus_count': jesus_series,'christ_count': christ_series}
df = pd.DataFrame(data)
df.to_csv(os.path.join(os.getcwd(), 'movies.csv'), index=False)
# data = {'scripts': script_names, 'god_count': god_series,'jesus_count': jesus_series,'christ_count': christ_series}
# print(data)
#
# df = pd.DataFrame(data)
# df.to_csv(os.path.join(os.getcwd(), 'movies.csv'), index=False)
1 change: 0 additions & 1 deletion movies.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ scripts,god_count,jesus_count,christ_count
Carrie,14,12,2
Catch-Me-If-You-Can,4,3,0
"Cell,-The",5,3,2
Kafka,2,0,0
13 changes: 13 additions & 0 deletions springfield.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from bs4 import BeautifulSoup
import requests

# Fetch the movie-script index page. The timeout keeps the script from hanging
# indefinitely if the server never answers, and raise_for_status() aborts on a
# 4xx/5xx response instead of parsing an error page as if it were the listing.
res = requests.get(
    "https://www.springfieldspringfield.co.uk/movie_scripts.php",
    timeout=30,
)
res.raise_for_status()

# Parse the returned HTML with the lxml parser backend.
soup = BeautifulSoup(res.text, "lxml")

# find() returns the first <div class="main-content-left"> element, or None
# when the page contains no such element.
# NOTE(review): presumably this div wraps the movie listing - confirm against
# the live page.
outer_box = soup.find('div', {'class': "main-content-left"})
print(outer_box)

0 comments on commit 083d3f2

Please sign in to comment.