In [1]:
#import the required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# task 1
# Extract all the image links from: https://en.wikipedia.org/wiki/Bowling_Green_State_University
url = "https://en.wikipedia.org/wiki/Bowling_Green_State_University"
response = requests.get(url)
content = response.content

soup = BeautifulSoup(content, "html.parser")
img_tags = soup.find_all("img") # find all image HTML elements

img_links = [img["src"] for img in img_tags] # find all img strings inside the "src" tags of each image element

for i, img_link in enumerate(img_links):
    
    # we will append the base URL in front of each image location
    print(
        f"{i + 1}: https://en.wikipedia.org/wiki/Bowling_Green_State_University{img_link}"
    )
    print()

print("There are", len(img_links), "image links on the page.")

1: https://en.wikipedia.org/wiki/Bowling_Green_State_University/static/images/icons/wikipedia.png

2: https://en.wikipedia.org/wiki/Bowling_Green_State_University/static/images/mobile/copyright/wikipedia-wordmark-en.svg

3: https://en.wikipedia.org/wiki/Bowling_Green_State_University/static/images/mobile/copyright/wikipedia-tagline-en.svg

4: https://en.wikipedia.org/wiki/Bowling_Green_State_University//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png

5: https://en.wikipedia.org/wiki/Bowling_Green_State_University//upload.wikimedia.org/wikipedia/en/thumb/a/a3/Bowling_Green_State_University_seal.svg/150px-Bowling_Green_State_University_seal.svg.png

6: https://en.wikipedia.org/wiki/Bowling_Green_State_University//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png

7: https://en.wikipedia.org/wiki/Bowling_Green_State_University//upload.wikimedia.org/wikiped

In [3]:
# task 2
url = "https://www.imdb.com/chart/top"
response = requests.get(url)
content = response.content

soup = BeautifulSoup(content, 'html.parser')
movies = soup.find_all('tr') # get each row containing movie data

for movie in movies:
    
  # start with empty data, to verify the requirements with later
  movieName = ""
  releaseYear = None
  rating = ""
  
  titleTd = movie.select_one('.titleColumn')
  if titleTd:
    movieName = titleTd.find("a").text # get the movieName from the column with class "titleColumn"
    releaseYear = titleTd.find("span", {'class': "secondaryInfo"}).text.replace("(", "").replace(")", "") # get the movie release year and clean it
  ratingTd = movie.select_one('.ratingColumn.imdbRating')
  if ratingTd:
    strongTag = ratingTd.find('strong')
    rating = strongTag.text
  
  # if we successfully scraped all required data...
  if movieName and releaseYear and rating:
    if int(releaseYear) >= 1990 and int(releaseYear) <= 1999 and float(rating) > 8.5:
      print(movieName, rating)

The Shawshank Redemption 9.2
Schindler's List 8.9
Pulp Fiction 8.8
Forrest Gump 8.8
Fight Club 8.7
The Matrix 8.7
Goodfellas 8.7
Se7en 8.6
The Silence of the Lambs 8.6
Saving Private Ryan 8.6
Life Is Beautiful 8.6
The Green Mile 8.6


In [4]:
# task 3

# specify the website we want to scrape from
url = "https://www.worldometers.info/coronavirus/"
response = requests.get(url)
content = response.content  # the website html is now stored in this variable

soup = BeautifulSoup(content, "html.parser")
tbody = soup.find("tbody")  # get the table body containing the data we want to scrape
tr = tbody.find_all("tr")  # get all of the table rows within the table body

# used when looping through the website columns
# each will later map to column names such as totalCases, newCases, etc.
colIndexes = {
    2: [],
    3: [],
    4: [],
    5: [],
    6: [],
    7: [],
    8: [],
    9: [],
    10: [],
    11: [],
    12: [],
    13: [],
    14: [],
}
# each of the columns will eventually have an array with 231 items

countries = []  # this will be the index of the dataframe

# first, get all of the country names
for r in tr:
    countryName = r.find("a")

    if countryName:
        countries.append(countryName.text)
    else:
        countryName = r.find("span")
        if countryName:
            countries.append(countryName)
# after this block of code runs, there will be 231 countries stored in the countries array

for i in range(8, len(tr)):
    data = tr[i].find_all("td")

    # loop through each column
    for i in range(2, 15):
        colIndexes[i].append(data[i].text)

dfData = {
    "totalCases": colIndexes[2],
    "newCases": colIndexes[3],
    "totalDeaths": colIndexes[4],
    "newDeaths": colIndexes[5],
    "totalRecovered": colIndexes[6],
    "newRecovered": colIndexes[7],
    "activeCases": colIndexes[8],
    "seriousCritical": colIndexes[9],
    "totCasesPerMillion": colIndexes[10],
    "totDeathsPerMillion": colIndexes[11],
    "totalTests": colIndexes[12],
    "testsPerMillion": colIndexes[13],
    "population": colIndexes[14],
}
df = pd.DataFrame(dfData, countries)
print(df)

                 totalCases newCases  \
USA             105,396,817   +1,838   
India            44,687,820            
France           39,638,159            
Germany          38,210,850            
Brazil           37,076,053            
...                     ...      ...   
Vatican City             29            
Western Sahara           10            
[MS Zaandam]              9            
Tokelau                   5            
China               503,302            

                                              totalDeaths newDeaths  \
USA             1,146,740                                        +5   
India             530,775                                             
France            165,030                                             
Germany           168,397                                             
Brazil            699,276                                             
...                                                   ...       ...   
Vatican City          

In [5]:
# task 4

url = "https://weather.com/weather/today/l/f06848d98b2de6e4c8e057e63fa6ba308ead9c397626e2dbab996d0c687bf5a6"

# simply swap out the URL with a new weather.com URL and data will be scraped
# the above URL points to the city: Bowling Green, KY

response = requests.get(url)
content = response.content

soup = BeautifulSoup(content, "html.parser")
section1 = soup.find("section", {'class': "card"})  # get the section containing the weather today in Bowling Green, KY

city = soup.find("h1", {"class": "CurrentConditions--location--1YWj_"}).text
print("The following is a report for the city " + city)
# temperature
print("Current temp:", section1.find('span', {'data-testid': "TemperatureValue"}).text)

section2 = soup.find("div", {"class": "TodayDetailsCard--detailsContainer--2yLtL"})
print("Wind Speed:", section2.find("span", {'data-testid': "Wind"}).text[14:])
print("Humidity:", section2.find("div", {"class": "WeatherDetailsListItem--wxData--kK35q"}).find_all("span")[1].text)
print("UV Index:", section2.find_all("div", {"data-testid": "WeatherDetailsListItem"})[5].find('span', {'data-testid':"UVIndexValue"}).text)

The following is a report for the city Bowling Green, KY
Current temp: 62°
Wind Speed: 1 mph
Humidity: 46°
UV Index: 0 of 10
