# Project 2: Web Scraping and API access

In [1]:
!pip install beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the html code for a table.

In [3]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

In [4]:
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
table = soup.find("table", {"class": "wikitable"})

In [6]:
print(table.prettify())

<table class="wikitable sortable plainrowheaders sticky-header col4right col5center col6center" style="margin:auto; margin:auto;">
 <caption>
  Highest-grossing films
  <sup class="reference" id="cite_ref-12">
   <a href="#cite_note-12">
    <span class="cite-bracket">
     [
    </span>
    12
    <span class="cite-bracket">
     ]
    </span>
   </a>
  </sup>
 </caption>
 <tbody>
  <tr>
   <th>
    Rank
   </th>
   <th>
    Peak
   </th>
   <th class="unsortable">
    Title
   </th>
   <th class="unsortable">
    Worldwide gross
   </th>
   <th>
    Year
   </th>
   <th class="unsortable">
    <abbr title="References">
     Ref
    </abbr>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td>
    1
   </td>
   <th scope="row">
    <i>
     <a href="/wiki/Avatar_(2009_film)" title="Avatar (2009 film)">
      Avatar
     </a>
    </i>
   </th>
   <td>
    $2,923,706,026
   </td>
   <td>
    2009
   </td>
   <td>
    <sup class="reference" id="cite_ref-avatar_13-0">
     <a href="#cite

### B. Using inspect element, find the html syntax for a link. 

In [7]:
links = soup.find_all("a")

### C. Using inspect element, find the html syntax for linking an image

In [9]:
images = soup.find_all("img")

## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [11]:
import bs4
import requests
import pandas as pd

In [12]:
#save and print the text content of a page with all tags removed
text_content = soup.get_text()

In [13]:
with open("page_text.txt", "w", encoding="utf-8") as file:
    file.write(text_content)

In [15]:
print(text_content)





List of highest-grossing films - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Highest-grossing films








2
Highest-grossing films adjusted for inflation








3
High-grossing films by year








4
Timeline of highest-grossing films








5
Highest-grossing franchises and film series








6
See also








7
Notes








8
References




Toggle References subsection





8.1
Box office sources









In [19]:
#download an image with beautifulsoup and save it in this repository
image = soup.find("img")
if image:
    image_src = image["src"]
    
    if image_src.startswith("//"):
        image_url = "https:" + image_src
    elif image_src.startswith("http"):
        image_url = image_src
    else:
        image_url = wiki_url + image_src  

    
    image_response = requests.get(image_url)
    image_path = "downloaded_image.jpg"
    
    with open(image_path, "wb") as file:
        file.write(image_response.content)
    
    print(f"Image downloaded and saved as {image_path}")
else:
    print("No image found on the page.")


Image downloaded and saved as downloaded_image.jpg


In [20]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links
links = soup.find_all("a")
for link in links[:10]:
    href = link.get("href", "No link available")
    print(f"{href[:100]}")

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal


## Part 3: Downloading scripts

In [21]:
scripts=pd.read_csv('pudding_data.csv')

In [22]:
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [23]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags
def fetch_script_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        script_text = soup.get_text()[:1000]  # Extract first 1000 characters
        return script_text
    except Exception as e:
        return f"Error fetching script: {str(e)}"

In [27]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"

In [24]:
scripts["script_text"] = scripts["link"].apply(lambda x: fetch_script_text(x) if pd.notna(x) else "No link")

In [25]:
scripts.to_csv("pudding_texts.csv", index=False)

In [26]:
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link,script_text
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...,Error fetching script: HTTPConnectionPool(host...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...,Error fetching script: Failed to parse: Franke...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-...",Error fetching script: Invalid URL 'film_20100...
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...,\n\n\n\n\nAmerican Madness Script at IMSDb.\n\...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...,\n\n\n\n\nGrand Hotel Script at IMSDb.\n\n\n\n...
...,...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf",Error fetching script: Invalid URL 'Pay The Gh...
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...,MediumOpen in appSign upSign inWriteSign upSig...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf,Error fetching script: Failed to parse: Dope (...
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...,MediumOpen in appSign upSign inWriteSign upSig...


## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account to authenticate

In [30]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 
api_key = "c522a7287e5078a789b5eab3d59342a3"
tmdb_url = f"https://api.themoviedb.org/3/movie/now_playing?api_key={api_key}&language=en-US&page=1"


In [31]:
response = requests.get(tmdb_url)
data = response.json()

In [32]:
movies = []
for movie in data['results']:
    movies.append({
        "Title": movie['title'],
        "Release Date": movie['release_date'],
        "Overview": movie['overview'],
        "Popularity": movie['popularity'],
        "Vote Average": movie['vote_average'],
        "Poster Path": movie['poster_path']
    })

In [33]:
tmdb_movies_df = pd.DataFrame(movies)
tmdb_movies_df.to_csv("tmdb_movies.csv", index=False)

In [34]:
tmdb_movies_df.head()

Unnamed: 0,Title,Release Date,Overview,Popularity,Vote Average,Poster Path
0,The Gorge,2025-02-13,Two highly trained operatives grow close from ...,3135.209,7.8,/7iMBZzVZtG0oBug4TfqDb9ZxAOa.jpg
1,Flight Risk,2025-01-22,A U.S. Marshal escorts a government witness to...,2568.332,6.0,/q0bCG4NX32iIEsRFZqRtuvzNCyZ.jpg
2,Sonic the Hedgehog 3,2024-12-19,"Sonic, Knuckles, and Tails reunite against a p...",1976.892,7.7,/d8Ryb8AunYAuycVKDp5HpdWPKgC.jpg
3,Captain America: Brave New World,2025-02-12,After meeting with newly elected U.S. Presiden...,1295.113,6.2,/pzIddUEMWhWzfvLI3TwxUG2wGoi.jpg
4,Companion,2025-01-22,During a weekend getaway at a secluded lakesid...,1222.243,7.1,/oCoTgC3UyWGfyQ9thE10ulWR7bn.jpg


In [36]:
#download the movie posters for 10 of these movies and save them to this repository
poster_filenames = []

for i, poster_path in enumerate(tmdb_movies_df["Poster Path"].dropna()[:10]):
    image_url = f"https://image.tmdb.org/t/p/w500{poster_path}"
    image_response = requests.get(image_url)
    
    poster_filename = f"movie_{i+1}.jpg"
    with open(poster_filename, "wb") as file:
        file.write(image_response.content)
    
    poster_filenames.append(poster_filename)
    print(f"Downloaded: {poster_filename}")

Downloaded: movie_1.jpg
Downloaded: movie_2.jpg
Downloaded: movie_3.jpg
Downloaded: movie_4.jpg
Downloaded: movie_5.jpg
Downloaded: movie_6.jpg
Downloaded: movie_7.jpg
Downloaded: movie_8.jpg
Downloaded: movie_9.jpg
Downloaded: movie_10.jpg


Citations:

OpenAI. (2025). ChatGPT (March 4, 2025) [Large language model]. OpenAI. Retrieved from https://openai.com/chatgpt