## Imports

In [1]:
#we want to install requests (if not already installed) using pip install requests, done in terminal
#requests helps us get the HTML
import requests

#we want to install beautifulsoup (if not already installed) using pip install beautifulsoup4, done in terminal
#beautifulsoup helps us read the html
from bs4 import BeautifulSoup as bs

#we want to install selenium (if not already installed) using pip install selenium, done in terminal
#import selenium
from selenium import webdriver

#time for being human-like
import time

#pandas dataframe
import pandas as pd

## Let's try to webscrape off one website first (aka, no selenium)
We will first try to get just the episode titles of Gravity Falls from an IMDB page

In [2]:
# let's make a dataframe to store our data
column_names = ["Episode Title"]
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,Episode Title


In [3]:
# Getting page HTML through request
gravity_falls = requests.get('https://www.imdb.com/title/tt1865718/episodes?season=1&ref_=tt_eps_sn_1')
soup = bs(gravity_falls.content, 'html.parser') # Parsing content using beautifulsoup

Let's try to just get our film titles first

In [4]:
titles = soup.find_all("a", title=True, itemprop="name")
titles

[<a href="/title/tt2152239/" itemprop="name" title="Tourist Trapped">Tourist Trapped</a>,
 <a href="/title/tt2168670/" itemprop="name" title="The Legend of the Gobblewonker">The Legend of the Gobblewonker</a>,
 <a href="/title/tt2168676/" itemprop="name" title="Headhunters">Headhunters</a>,
 <a href="/title/tt2229194/" itemprop="name" title="The Hand That Rocks the Mabel">The Hand That Rocks the Mabel</a>,
 <a href="/title/tt2229196/" itemprop="name" title="The Inconveniencing">The Inconveniencing</a>,
 <a href="/title/tt2229192/" itemprop="name" title="Dipper vs. Manliness">Dipper vs. Manliness</a>,
 <a href="/title/tt2267658/" itemprop="name" title="Double Dipper">Double Dipper</a>,
 <a href="/title/tt2267660/" itemprop="name" title="Irrational Treasure">Irrational Treasure</a>,
 <a href="/title/tt2310928/" itemprop="name" title="The Time Traveler's Pig">The Time Traveler's Pig</a>,
 <a href="/title/tt2340344/" itemprop="name" title="Fight Fighters">Fight Fighters</a>,
 <a href="/tit

In [5]:
for title in titles:
    print(title.get_text())

Tourist Trapped
The Legend of the Gobblewonker
Headhunters
The Hand That Rocks the Mabel
The Inconveniencing
Dipper vs. Manliness
Double Dipper
Irrational Treasure
The Time Traveler's Pig
Fight Fighters
Little Dipper
Summerween
Boss Mabel
Bottomless Pit!
The Deep End
Carpet Diem
Boyz Crazy
Land Before Swine
Dreamscaperers
Gideon Rises


let's add to a list to add this information to our dataframe


In [6]:
episode_titles = []
for title in titles:
    episode_titles.append(title.get_text())
df["Episode Title"] = episode_titles
df

Unnamed: 0,Episode Title
0,Tourist Trapped
1,The Legend of the Gobblewonker
2,Headhunters
3,The Hand That Rocks the Mabel
4,The Inconveniencing
5,Dipper vs. Manliness
6,Double Dipper
7,Irrational Treasure
8,The Time Traveler's Pig
9,Fight Fighters


Let's try to get our episode dates now. 

In [7]:
dates = soup.find_all(class_="airdate")
dates

[<div class="airdate">
             15 Jun. 2012
     </div>,
 <div class="airdate">
             29 Jun. 2012
     </div>,
 <div class="airdate">
             30 Jun. 2012
     </div>,
 <div class="airdate">
             6 Jul. 2012
     </div>,
 <div class="airdate">
             13 Jul. 2012
     </div>,
 <div class="airdate">
             20 Jul. 2012
     </div>,
 <div class="airdate">
             10 Aug. 2012
     </div>,
 <div class="airdate">
             17 Aug. 2012
     </div>,
 <div class="airdate">
             24 Aug. 2012
     </div>,
 <div class="airdate">
             14 Sep. 2012
     </div>,
 <div class="airdate">
             28 Sep. 2012
     </div>,
 <div class="airdate">
             5 Oct. 2012
     </div>,
 <div class="airdate">
             15 Feb. 2013
     </div>,
 <div class="airdate">
             1 Mar. 2013
     </div>,
 <div class="airdate">
             15 Mar. 2013
     </div>,
 <div class="airdate">
             5 Apr. 2013
     </div>,
 <div class=

In [8]:
for date in dates:
    print(date.get_text().strip())

15 Jun. 2012
29 Jun. 2012
30 Jun. 2012
6 Jul. 2012
13 Jul. 2012
20 Jul. 2012
10 Aug. 2012
17 Aug. 2012
24 Aug. 2012
14 Sep. 2012
28 Sep. 2012
5 Oct. 2012
15 Feb. 2013
1 Mar. 2013
15 Mar. 2013
5 Apr. 2013
19 Apr. 2013
28 Jun. 2013
12 Jul. 2013
2 Aug. 2013


Great! Let's add this to our dataframe

In [9]:
episode_dates = []
for date in dates:
    episode_dates.append(date.get_text().strip())
df["Air Date"] = episode_dates
df["Air Date"] = df["Air Date"].astype('datetime64[ns]') #convert String to datetime cuz why not
df

Unnamed: 0,Episode Title,Air Date
0,Tourist Trapped,2012-06-15
1,The Legend of the Gobblewonker,2012-06-29
2,Headhunters,2012-06-30
3,The Hand That Rocks the Mabel,2012-07-06
4,The Inconveniencing,2012-07-13
5,Dipper vs. Manliness,2012-07-20
6,Double Dipper,2012-08-10
7,Irrational Treasure,2012-08-17
8,The Time Traveler's Pig,2012-08-24
9,Fight Fighters,2012-09-14


## Let's try to incorporate Selenium into our program now
We want to automate our program such that it clicks the hyperlink of each movie and gets information about its box office

In [10]:
# set up webdriver, specifically chromedriver
home_list = 'https://www.imdb.com/title/tt1865718/episodes?season=1&ref_=tt_eps_sn_1'

driver = webdriver.Chrome(executable_path='Downloads/chromedriver')
driver.get(home_list)

#we should get a pop up chrome tab 

  driver = webdriver.Chrome(executable_path='Downloads/chromedriver')


In [11]:
http = "https://www.imdb.com/"
# we will add the href onto the end of the http string
hrefs = []
for title in titles:
    hrefs.append(title.get('href'))

i=0
while i < len(df["Episode Title"]):
    url = http + hrefs[i]
    driver.get(url)
    episode_soup = bs(driver.page_source, 'html.parser')
    i += 1
    time.sleep(5)

Great! We can now go from hyperlink to hyperlink. Now, let's scrape each website's box office data

In [12]:
http = "https://www.imdb.com/"
# we will add the href onto the end of the http string
hrefs = []
for title in titles:
    hrefs.append(title.get('href'))

i=0
synopses = []
while i < len(df["Episode Title"]):
    url = http + hrefs[i]
    driver.get(url)
    episode_soup = bs(driver.page_source, 'html.parser')
    synopsis = episode_soup.find(class_="sc-16ede01-2 gXUyNh").get_text()
    synopses.append(synopsis)
    i += 1
    time.sleep(5)

In [13]:
# add to dataframe
df["Synopsis"] = synopses

In [14]:
df

Unnamed: 0,Episode Title,Air Date,Synopsis
0,Tourist Trapped,2012-06-15,"After finding a strange book in the forest, Di..."
1,The Legend of the Gobblewonker,2012-06-29,Dipper and Mabel hear rumours that a sea monst...
2,Headhunters,2012-06-30,The kids discover Stan's collection of wax scu...
3,The Hand That Rocks the Mabel,2012-07-06,Dipper and Mabel meet an intriguing new neighb...
4,The Inconveniencing,2012-07-13,Dipper tries to impress Wendy by going along w...
5,Dipper vs. Manliness,2012-07-20,"In order to become more manly, Dipper seeks th..."
6,Double Dipper,2012-08-10,Stan throws a big party to promote the Mystery...
7,Irrational Treasure,2012-08-17,"During Gravity Falls' Pioneer Day celebration,..."
8,The Time Traveler's Pig,2012-08-24,Dipper ruins a chance to impress Wendy at a fa...
9,Fight Fighters,2012-09-14,"Dipper brings a video game character, Rumble M..."
