# Project 2: Web Scraping and API access

In [1]:
!pip install beautifulsoup4



## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the html code for a table.

In [None]:
html_table = """
<table class="wikitable">
    <caption>Dog breeds</caption>
    <thead>
        <tr>
            <th>Breed</th>
            <th>Origin</th>
            <th>Size</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Labrador Retriever</td>
            <td>Canada</td>
            <td>Large</td>
        </tr>
        <tr>
            <td>German Shepherd</td>
            <td>Germany</td>
            <td>Large</td>
        </tr>
        <tr>
            <td>Beagle</td>
            <td>United Kingdom</td>
            <td>Medium</td>
        </tr>
    </tbody>
</table>
"""
print(html_table)

### B. Using inspect element, find the html syntax for a link. 

In [None]:
html_link = '<a href="https://en.wikipedia.org/wiki/Dog">Dog Wikipedia Page</a>'
print(html_link)

### C. Using inspect element, find the html syntax for linking an image

In [None]:
html_image_link = """
<a href="https://en.wikipedia.org/wiki/Dog">
    <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/a3/June_odd-eyed-cat.jpg/220px-June_odd-eyed-cat.jpg" alt="Dog Image">
</a>
"""
print(html_image_link)

## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [2]:
import bs4
import requests
import pandas as pd

In [None]:
#save and print the text content of a page with all tags removed

In [3]:
# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Dog"
response = requests.get(url)

# Parse the HTML content
soup = bs4.BeautifulSoup(response.content, 'html.parser')

# Extract the text content and remove all tags
text_content = soup.get_text()

# Save the text content to a file
with open("dog_wikipedia_page.txt", "w", encoding="utf-8") as file:
    file.write(text_content)

# Print the text content
print(text_content)





Dog - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Taxonomy




Toggle Taxonomy subsection





1.1
Domestication








1.2
Breeds










2
Anatomy and physiology




Toggle Anatomy and physiology subsection





2.1
Size and skeleton








2.2
Senses








2.3
Coat








2.4
Dewclaw








2.5
Tail










3
Health




Toggle Health subsection





3.1
Lifespan








3.2
Reproduction






3.2.1
Ne

In [None]:
#download an image with beautifulsoup and save it in this repository

In [8]:
# Find the image URL
image_tag = soup.find('img')
image_url = "https://wikipedia.org"+image_tag['src']

# Download the image
image_response = requests.get(image_url)

# Save the image to a file
with open("dog_image.jpg", "wb") as file:
    file.write(image_response.content)

print("Image downloaded and saved as dog_image.jpg")

Image downloaded and saved as dog_image.jpg


In [None]:
# Find all the links on the page
links = soup.find_all('a', href=True)

# Print the first 100 characters of ten of these links
for link in links[:10]:
    print(link['href'][:100])

In [None]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links

## Part 3: Downloading scripts

In [5]:
scripts=pd.read_csv('pudding_data.csv')

In [6]:
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [None]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags

In [None]:
# Initialize the lists to store titles and text content
titles = []
texts = []

# Iterate over the links in the "link" column
for link in scripts['link']:
    # Fetch the page content
    response = requests.get(link)
    
    # Parse the HTML content
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    
    # Extract the text content and remove all tags
    text_content = soup.get_text()[:1000]
    
    # Append the title and text content to the lists
    titles.append(soup.title.string if soup.title else 'No Title')
    texts.append(text_content)

# Add the new columns to the dataframe
scripts['title'] = titles
scripts['text'] = texts

# Save the new dataframe to a CSV file
scripts.to_csv("pudding_texts.csv", index=False)

In [None]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"

In [None]:
df.to_csv("pudding_texts.csv")

## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account to authenticate

In [None]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 

In [16]:
!pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.9.0-py3-none-any.whl.metadata (8.0 kB)
Downloading tmdbv3api-1.9.0-py3-none-any.whl (25 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.9.0


In [None]:
#download the movie posters for 10 of these movies and save them to this repository

https://tse1.mm.bing.net/th?id=OIP.iqoDQsNxl8zDsGvFPg0gmgHaEK&pid=Api
https://tse4.mm.bing.net/th?id=OIP.TEMocs9FOYRFPnTbYRW1TQHaK9&pid=Api
https://tse4.mm.bing.net/th?id=OIP.8Yx2iEGKtFPmMiZrYyPGmgHaLu&pid=Api
https://tse2.mm.bing.net/th?id=OIP.BmtTfFXWtibjNmzU1SWrMQHaK3&pid=Api
https://tse4.mm.bing.net/th?id=OIP.8L6VgMVHMqpRJzjVN5nYxQHaK8&pid=Api
https://tse4.mm.bing.net/th?id=OIP.fF7WQgxVewyUTq13Bis1AgHaK9&pid=Api
https://tse1.mm.bing.net/th?id=OIP.wPOYV0rJNlrPSlMr-JGgvwHaK9&pid=Api
https://tse2.mm.bing.net/th?id=OIP.VqLl7oLBwGhvEySITZcklQHaLH&pid=Api
https://tse4.mm.bing.net/th?id=OIP.BIOJXfcr_i0ifZxYL8lUbAHaHa&pid=Api
https://tse3.mm.bing.net/th?id=OIF.7ghdH%2BSDdSvm%2Fb4%2BbuIN%2Fw&pid=Api

