In [19]:
import requests
import pymongo
import json
from bs4 import BeautifulSoup
import re
import time
import os


# Page that lists the highest-grossing films; this is the scraper's entry point.
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# HTTP headers sent with every request (a desktop-browser User-Agent string).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def get_wikipedia_page(url, timeout=10):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with ``html.parser``, or ``None``
        when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout, ...) are
        # reported the same way as a bad status code so callers only have to
        # handle the ``None`` case.
        print(f"Failed to retrieve the page: {exc}")
        return None
    if response.status_code != 200:
        print("Failed to retrieve the page")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def clean_numeric(value):
    """Convert a table-cell string such as ``"$2,923,706,026"`` to an ``int``.

    Bracketed footnote markers (for example ``"[# 1]"`` or ``"[a]"``) are
    removed first so that digits inside them are not glued onto the number;
    every remaining non-digit character is then dropped.

    Args:
        value: Text of the cell.

    Returns:
        The integer formed by the remaining digits.

    Raises:
        ValueError: If ``value`` contains no digits outside footnote markers.
    """
    without_notes = re.sub(r'\[[^\]]*\]', '', value)
    digits = re.sub(r'\D', '', without_notes)
    if not digits:
        raise ValueError(f"no digits found in {value!r}")
    return int(digits)

def extract_film_data():
    """Scrape the film table at ``URL`` and enrich each row from the film's own page.

    For every usable table row the rank, peak, revenue and year are parsed
    from the first four ``<td>`` cells and the title from the row's ``<th>``
    cell. The director and country are then looked up on the linked film page.

    Returns:
        list[dict]: One dict per film with the keys ``"title"``,
        ``"release_year"``, ``"director"``, ``"revenue"`` and ``"country"``.

    Raises:
        RuntimeError: If the list page cannot be downloaded or contains no
            table with the ``wikitable`` class.
    """
    soup = get_wikipedia_page(URL)
    if soup is None:
        raise RuntimeError(f"Could not download {URL}")

    table = soup.find("table", {"class": "wikitable"})
    if table is None:
        raise RuntimeError(f"No 'wikitable' table found at {URL}")

    films = []

    for row in table.find_all("tr"):
        cols = row.find_all("td")
        title_col = row.find("th")
        # Rows with fewer than four <td> cells (e.g. a header row) or without
        # a <th> title cell do not describe a film.
        if len(cols) < 4 or title_col is None:
            continue

        title = title_col.get_text(strip=True)
        try:
            rank = clean_numeric(cols[0].get_text(strip=True))
            peak = clean_numeric(cols[1].get_text(strip=True))
            revenue = clean_numeric(cols[2].get_text(strip=True))
            year = clean_numeric(cols[3].get_text(strip=True))
        except ValueError:
            # One malformed row should not abort the whole scrape.
            print(f"Skipping row with non-numeric cells: {title}")
            continue

        film_page_link = title_col.find("a")
        # .get() tolerates an <a> tag that has no href attribute.
        href = film_page_link.get("href") if film_page_link else None
        film_url = "https://en.wikipedia.org" + href if href else None

        print(rank, peak, title, revenue, year)
        print(film_url)

        if film_url:
            director, country = get_film_details(film_url)
        else:
            director, country = "Unknown", "Unknown"

        films.append({"title": title, "release_year": year, "director": director, "revenue": revenue, "country": country})
        time.sleep(0.1)  # Avoid hitting Wikipedia too frequently

    return films

def get_film_details(film_url):
    """Read a film's director and country from the infobox on its page.

    Args:
        film_url: URL of the film's page.

    Returns:
        tuple: ``(director, country)``. Each element is ``"Unknown"`` when the
        page could not be fetched, has no ``infobox`` table, or has no
        matching row. When a cell contains ``<li>`` items only the first one
        is used; otherwise the whole cell text is used.
    """
    page = get_wikipedia_page(film_url)
    if page is None:
        return "Unknown", "Unknown"

    def first_entry(cell):
        # First list item if the cell holds a list, else the full cell text.
        items = cell.find_all("li")
        if items:
            return items[0].get_text(strip=True)
        return cell.get_text(strip=True)

    director = "Unknown"
    country = "Unknown"

    infobox = page.find("table", {"class": "infobox"})
    if not infobox:
        return director, country

    for entry in infobox.find_all("tr"):
        label_cell = entry.find("th")
        if not label_cell:
            continue
        label = label_cell.text

        if "Directed by" in label:
            cell = entry.find("td")
            if cell:
                director = first_entry(cell)

        # Checked independently of the director branch; a later matching row
        # overwrites an earlier one.
        if "Country" in label or "Countries" in label:
            cell = entry.find("td")
            if cell:
                country = first_entry(cell)

    return director, country

In [20]:
def save_data_to_json(films, path="films_data.json"):
    """Write ``films`` to ``path`` as indented JSON.

    Args:
        films: JSON-serialisable data to store (the scraped film list).
        path: Destination file; defaults to ``films_data.json`` in the
            current working directory. An existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(films, f, indent=4)
    print(f"Data saved to {path} for testing.")

def load_data_from_json(path="films_data.json"):
    """Load previously saved film data from ``path``.

    Args:
        path: JSON file to read; defaults to ``films_data.json`` in the
            current working directory.

    Returns:
        The decoded JSON content, or ``None`` when the file does not exist or
        does not contain valid JSON (for example after an interrupted write).
    """
    try:
        # Opening directly avoids the gap between an existence check and the
        # actual read.
        with open(path, "r", encoding="utf-8") as f:
            films = json.load(f)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError as exc:
        # Treat a corrupt cache as a cache miss instead of failing every run.
        print(f"Ignoring unreadable cache file {path}: {exc}")
        return None
    print(f"Data loaded from {path}.")
    return films

In [27]:
# The Atlas connection string embeds a username and password, so it is read
# from the environment rather than stored in the notebook.
# NOTE(review): the credential that used to be hard-coded here has been exposed
# in source and should be rotated.
ATLAS_URI = os.environ.get("ATLAS_URI")
if not ATLAS_URI:
    raise RuntimeError(
        "Set the ATLAS_URI environment variable to the MongoDB Atlas connection string."
    )

class AtlasClient:
    """Small convenience wrapper around a pymongo client bound to one database."""

    def __init__(self, atlas_uri, dbname):
        """Create the pymongo client for ``atlas_uri`` and select database ``dbname``."""
        self.mongodb_client = pymongo.MongoClient(atlas_uri)
        self.database = self.mongodb_client[dbname]

    def ping(self):
        """Send a ``ping`` command to the server's ``admin`` database."""
        self.mongodb_client.admin.command("ping")

    def get_collection(self, collection_name):
        """Return the collection called ``collection_name`` from the selected database."""
        return self.database[collection_name]

    def insert_many(self, collection_name, data):
        """Insert the documents in ``data`` into ``collection_name``.

        Empty input is skipped with a message instead of being forwarded,
        because pymongo's ``insert_many`` rejects an empty document list.
        """
        if not data:
            print("No documents to insert.")
            return
        collection = self.get_collection(collection_name)
        collection.insert_many(data)
        print("Data inserted successfully.")

    def delete_all(self, collection_name):
        """Delete every document in ``collection_name``."""
        collection = self.get_collection(collection_name)
        collection.delete_many({})
        print("All documents deleted.")

    def find(self, collection_name, filter=None, limit=0):
        """Return the documents of ``collection_name`` that match ``filter`` as a list.

        Args:
            collection_name: Collection to query.
            filter: Query document; ``None`` (the default) matches everything.
                A ``None`` default replaces the former shared mutable ``{}``.
            limit: Passed through to the collection's ``find`` as ``limit``.
        """
        collection = self.get_collection(collection_name)
        query = {} if filter is None else filter
        return list(collection.find(query, limit=limit))
    
# Shared client used by the cells below; it is bound to the "movies_db" database.
atlas_client = AtlasClient(atlas_uri=ATLAS_URI, dbname="movies_db")

In [28]:
def main():
    """Load the film list (cache first, scraping as fallback) and insert it into MongoDB.

    The films are inserted into the ``highest_grossing_films`` collection via
    the module-level ``atlas_client``.

    Raises:
        RuntimeError: If there is no usable cached data and scraping yields
            no films.
    """
    films = load_data_from_json()
    # An empty cached list is treated like a missing cache: pymongo's
    # insert_many rejects an empty document list, so reusing it would make
    # every run fail until the cache file is deleted by hand.
    if not films:
        films = extract_film_data()
        if not films:
            # Fail before writing the cache so an empty result is not persisted.
            raise RuntimeError("No films were scraped; nothing to save or insert.")
        save_data_to_json(films)
    atlas_client.insert_many("highest_grossing_films", films)

# Run the pipeline only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Data loaded from films_data.json.
Data inserted successfully.
