In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Make the root of the mounted Drive the working directory (IPython `cd` automagic).
cd /content/drive/MyDrive

/content/drive/MyDrive


In [11]:
# Name of the project folder inside MyDrive; interpolated into the `cd` call in the next cell.
repo = 'ada-2024-project-adarable'

In [12]:
# Enter the project folder; IPython expands `{repo}` to the value of the Python variable.
# All relative paths used below (data/..., checkpoint.txt) resolve against this directory.
cd {repo}

/content/drive/MyDrive/ada-2024-project-adarable


In [None]:
!pip install requests beautifulsoup4 selenium webdriver_manager

Collecting selenium
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.26.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install the latest version of Google Chrome
!apt-get update
!apt-get install -y wget curl unzip
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt --fix-broken install -y


In [None]:
# Install ChromeDriver using webdriver_manager
!pip install -q webdriver_manager

In [None]:
# Automatically reload all modules before executing code, so that edits to project-local
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [22]:
# Load the table of movie IDs that drives the scrape below.
# NOTE(review): the CSV carries a saved index column, which shows up as "Unnamed: 0"
# in the outputs that follow; only `wikipedia_movie_id` is used later.
import pandas as pd
movies = pd.read_csv('data/data_to_scrape/left_to_scrape1.csv')

In [23]:
# Number of rows loaded (one row per movie ID).
movies.shape[0]

1168

In [15]:
# Peek at the first row to check the column layout.
movies.head(1)

Unnamed: 0,wikipedia_movie_id
0,4227


In [24]:
# Column dtypes of the loaded frame.
movies.dtypes

Unnamed: 0,0
wikipedia_movie_id,int64


In [25]:
import csv
import os
from imdb_scraper import ImdbScraper

# Initialize the scraper object
# (ImdbScraper is project-local; what it starts up is not visible from this notebook.
# It is closed again via scraper.close() at the end of scrape_and_save_movie_data.)
scraper = ImdbScraper()

# Column of Wikipedia movie IDs taken from the loaded frame.
movies_id = movies['wikipedia_movie_id']

# Function to get the checkpoint from a previous run
def get_checkpoint():
    """Return the movie ID saved in ``checkpoint.txt``, or ``None`` if the file is absent.

    The value is the file's content with surrounding whitespace stripped, i.e. a
    string (possibly empty), not an integer.
    """
    checkpoint_path = 'checkpoint.txt'
    if not os.path.exists(checkpoint_path):
        return None
    with open(checkpoint_path, 'r') as handle:
        return handle.read().strip()

# Function to update the checkpoint after processing a movie
def update_checkpoint(movie_id):
    """Overwrite ``checkpoint.txt`` with the string form of ``movie_id``."""
    with open('checkpoint.txt', 'w') as checkpoint_file:
        checkpoint_text = str(movie_id)
        checkpoint_file.write(checkpoint_text)

def scrape_and_save_movie_data(movies):
    """Scrape IMDb data for every movie in ``movies`` and append it to a CSV file.

    Progress is tracked through ``get_checkpoint`` / ``update_checkpoint`` so an
    interrupted run can be resumed: scraping restarts at the row that follows
    the last movie whose data was written.

    Args:
        movies: DataFrame with a ``wikipedia_movie_id`` column. Rows are
            processed in their stored order, whatever the index labels are.

    Side effects:
        * Appends one row per successfully scraped movie to
          ``data/imdb_additional_movies_correct_left_to_scrape.csv`` (the
          header is written first when the file is new or empty).
        * Updates the checkpoint after each row that was written.
        * Closes the module-level ``scraper`` when done, including when the
          run is interrupted or an exception propagates.
    """
    # Define CSV file path
    csv_file_path = "data/imdb_additional_movies_correct_left_to_scrape.csv"

    # Column names for the CSV file
    fieldnames = [
        "wikipedia_movie_id", "movie_box_office_revenue", "budget",
        "opening_weekend", "rating_score", "producer", "release_year"
    ]

    # Use the IDs of the frame that was passed in (not a module-level copy), as a plain
    # list so that the resume offset and the slice below are both positional.
    movie_ids = movies['wikipedia_movie_id'].tolist()

    checkpoint = get_checkpoint()  # Get the checkpoint from the last run
    if checkpoint:
        print(f"Resuming from movie ID {checkpoint}")
    else:
        print("Starting from the beginning.")

    # Start from the movie after the checkpoint
    start_index = 0
    if checkpoint:
        try:
            # ValueError covers both a checkpoint that is not an integer and an ID
            # that does not occur in this list of movies.
            start_index = movie_ids.index(int(checkpoint)) + 1
        except ValueError:
            print("Checkpoint movie not found, starting from the first movie.")
            start_index = 0

    # Check if the file exists and open the file for appending
    file_exists = os.path.exists(csv_file_path)
    print(f"File exists: {file_exists}")  # Debug: Check if the file exists

    try:
        with open(csv_file_path, mode="a", newline='', encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)

            # If the file is being created for the first time (not exists or empty), write the header
            if not file_exists or os.path.getsize(csv_file_path) == 0:
                print("Writing header to file...")  # Debug: Writing header for first time
                writer.writeheader()

            # Loop through the movies to scrape and save data starting from the checkpoint
            for movie_id in movie_ids[start_index:]:
                print(f"Scraping data for movie ID {movie_id}...")  # Debug: Check current movie being processed

                # Scrape the movie info using the ImdbScraper
                movie_info = scraper.get_imdb_infos(movie_id)

                # Check if data is available and write it to the CSV file.
                # movie_info is handed straight to DictWriter, so it is presumably a dict
                # keyed by `fieldnames` -- confirm against ImdbScraper.
                if movie_info:
                    print(f"Writing data for movie ID {movie_id}...")  # Debug: Data being written
                    writer.writerow(movie_info)
                    # Push the row out before advancing the checkpoint, so the checkpoint
                    # never points past data that is still sitting in the write buffer.
                    file.flush()
                    update_checkpoint(movie_id)  # Update the checkpoint after processing each movie
                else:
                    print(f"Failed to scrape data for movie ID {movie_id}")  # Debug: Failed to scrape data

        print(f"Movie data saved to {csv_file_path}.")
    finally:
        # Release the scraper even when the loop is interrupted (e.g. KeyboardInterrupt).
        scraper.close()

# Run the scrape. If checkpoint.txt exists from an earlier run, processing resumes after
# the movie ID stored there; otherwise it starts with the first row of `movies`.
scrape_and_save_movie_data(movies)

Starting from the beginning.
File exists: False
Writing header to file...
Scraping data for movie ID 4227...
Writing data for movie ID 4227...
Scraping data for movie ID 8481...
Writing data for movie ID 8481...
Scraping data for movie ID 31908...
Writing data for movie ID 31908...
Scraping data for movie ID 41881...


KeyboardInterrupt: 

In [None]:
# Load previously scraped IMDb data for clean-up.
# NOTE(review): this is a different file from the one scrape_and_save_movie_data writes
# (data/imdb_additional_movies_correct_left_to_scrape.csv) -- confirm which output is meant.
df = pd.read_csv('data/imdb_additional_movies_for_sven.csv')

In [None]:
# Remove rows that are exact duplicates across all columns. This rebinds `df`, so the
# un-deduplicated frame is only recoverable by re-running the load cell.
df = df.drop_duplicates()

In [None]:
# Row count after de-duplication.
df.shape[0]

1164

In [None]:
# Persist the de-duplicated frame; index=False keeps the row index out of the CSV
# (avoids an extra "Unnamed: 0" column when the file is read back).
cleaned_file_path = 'data/imdb_additional_movies_data_half_1_cleaned.csv'
df.to_csv(cleaned_file_path, index=False)

In [None]:
# Read the cleaned file back from disk (same path as `cleaned_file_path` in the cell above).
df_cleaned = pd.read_csv('data/imdb_additional_movies_data_half_1_cleaned.csv')

In [None]:
# Row count of the reloaded file.
# NOTE(review): the saved outputs disagree (1164 rows before saving vs 1240 after reloading),
# so these cells were presumably run against different file states -- re-run in order to confirm.
df_cleaned.shape[0]

1240