In [21]:
## GET POSTERS FOR MOVIES FROM POSTER LINKS
import os
import pandas as pd
import requests
from tqdm import tqdm

# Load your movie CSV
df = pd.read_csv("data/sample_100_movies.csv")

# Create posters directory if it doesn't exist
poster_dir = "data/posters"
os.makedirs(poster_dir, exist_ok=True)

# Set user agent as required by Wikimedia
headers = {
    "User-Agent": "BollywoodThemesProject/0.1 (github link; contact: contact_email)"
}

# Initialize list to track failures
failed_downloads = []

# Loop through each movie
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Downloading posters"):
    imdb_id = row["imdb_id"]
    poster_url = row["poster_path"]

    if pd.isna(poster_url) or poster_url == "N/A":
        failed_downloads.append(f"{imdb_id}: Missing or 'N/A' poster URL")
        continue

    try:
        response = requests.get(poster_url, headers=headers, timeout=10)
        response.raise_for_status()

        file_path = os.path.join(poster_dir, f"{imdb_id}.jpg")
        with open(file_path, "wb") as f:
            f.write(response.content)

    except Exception as e:
        failed_downloads.append(f"{imdb_id}: {e}")

# Write all failed downloads to a file
if failed_downloads:
    with open("../data/supplementary files/failed_poster_downloads.txt", "w") as f:
        for error in failed_downloads:
            f.write(error + "\n")

Downloading posters: 100%|███████████████████████████████████████████████████████████| 101/101 [00:27<00:00,  3.61it/s]


In [5]:
## GET REMAINING POSERS FORM WIKI LINKS
import pandas as pd
import requests
import wikipedia
import re
import os

# --- Step 1: Load IMDb → Wikipedia link map ---
df = pd.read_csv("../data/sample_100_movies.csv")
id_to_wiki = dict(zip(df["imdb_id"], df["wiki_link"]))

# --- Step 2: Load failure file ---
failure_file = "../data/supplementary files/failed_poster_downloads.txt"
with open(failure_file, "r", encoding="utf-8") as f:
    original_lines = [line.strip() for line in f if line.strip()]
    failed_ids = [line.split(":")[0] for line in original_lines]

# --- Step 3: Prepare output folder ---
poster_dir = "../data/posters"
os.makedirs(poster_dir, exist_ok=True)

def get_first_image_url(wiki_url):
    """
    Return the first valid Wikipedia poster image URL without BeautifulSoup.
    """
    try:
        title = wiki_url.split("/wiki/")[-1]
        page_url = f"https://en.wikipedia.org/wiki/{title}"
        res = requests.get(page_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if res.status_code != 200:
            return None

        # Use regex to find all image tags
        matches = re.findall(r'<img[^>]+src="([^"]+)"[^>]*>', res.text)
        for img in matches:
            if "thumb" in img and img.endswith((".jpg", ".jpeg", ".png")):
                if img.startswith("//"):
                    return "https:" + img
                elif img.startswith("/"):
                    return "https://en.wikipedia.org" + img
                else:
                    return img
        return None
    except Exception:
        return None

# --- Step 4: Attempt to download images ---
successful_ids = []

for imdb_id in failed_ids:
    wiki_url = id_to_wiki.get(imdb_id)
    if not wiki_url:
        print(f"No wiki link for {imdb_id}")
        continue

    img_url = get_first_image_url(wiki_url)
    if not img_url:
        print(f"❌ No image found for {imdb_id}")
        continue

    try:
        img_data = requests.get(img_url, headers={"User-Agent": "Mozilla/5.0"}).content
        with open(f"{poster_dir}/{imdb_id}.jpg", "wb") as f:
            f.write(img_data)
        successful_ids.append(imdb_id)
        print(f"✅ Recovered and saved poster for {imdb_id}")
    except Exception as e:
        print(f"❌ Download failed for {imdb_id}: {e}")

# --- Step 5: Rewrite the failure file with only unsuccessful records ---
remaining_lines = [line for line in original_lines if line.split(":")[0] not in successful_ids]

with open(failure_file, "w", encoding="utf-8") as f:
    for line in remaining_lines:
        f.write(line + "\n")

print(f"\n🧹 Cleaned failure file. {len(successful_ids)} posters recovered and removed.")

✅ Recovered and saved poster for tt7743400
✅ Recovered and saved poster for tt3893476
✅ Recovered and saved poster for tt4853926
✅ Recovered and saved poster for tt3017412
❌ No image found for tt1353093
✅ Recovered and saved poster for tt9558612
✅ Recovered and saved poster for tt5668770

🧹 Cleaned failure file. 6 posters recovered and removed.


In [8]:
## GET SUBTITLES FOR MOVIES
import requests
import json
import pandas as pd
import os
import time
from tqdm import tqdm

# Define variables with API key and login information to send OpenSubtitles API a login request and get a token
API_KEY = "my_opensubtitles_api_key"
USERNAME = "my_opensubtitles_username"
PASSWORD = "my_opensubtitles_password"

headers = {
    "Api-Key": API_KEY,
    "Content-Type": "application/json",
    "User-Agent": "BollywoodThemesProject v0.1"
}

data = {
    "username": USERNAME,
    "password": PASSWORD
}

response = requests.post("https://api.opensubtitles.com/api/v1/login", headers=headers, json=data)

if response.status_code == 200:
    token = response.json()["token"]
    print("New token:", token)

    # Save token to file for now, until the task is done
    with open("opensubtitles_token.txt", "w") as f:
        f.write(token)
else:
    print("Login failed:", response.status_code, response.text)

✅ New token: eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiI2RXFqVDRTME14STllRnJxQzI5UE9HWVJ2R2lrTDUwMiIsImV4cCI6MTc1MzM0MDgzNX0.Dj5YzDqIkJSP54SwGNvoU-qm9V1vnv9gYgBcezUwgq0


In [35]:
# Your API values
API_KEY = "my_opensubtitles_api_key"
TOKEN = "my_opensubtitles_token"
USER_AGENT = "BollywoodThemesProject v0.1" # user agent created for this project

# Create folder to save subtitle files, if doesn't exist already
output_folder = "../data/subtitles/srt"
os.makedirs(output_folder, exist_ok=True)

# Load your CSV
df = pd.read_csv("../data/sample_100_movies.csv")
imdb_ids = df["imdb_id"].dropna().unique().tolist()

# Reusable header
HEADERS = {
    "Api-Key": API_KEY,
    "Authorization": f"Bearer {TOKEN}",
    "User-Agent": USER_AGENT,
    "Content-Type": "application/json"
}

In [13]:
# --- Helper function with retry ---
def safe_request(func, *args, **kwargs):
    try:
        response = func(*args, **kwargs)
        response.raise_for_status()
        return response
    except Exception as e:
        print(f"{e}")
        return None

# --- Download subtitle ---
def download_subtitle(imdb_id):
    numeric_id = imdb_id.replace("tt", "")
    
    # Step 1: Search subtitles
    search_url = "https://api.opensubtitles.com/api/v1/subtitles"
    search_params = {
        "imdb_id": numeric_id,
        "languages": "en",
        "order_by": "downloads",
        "order_direction": "desc"
    }
    response = safe_request(requests.get, search_url, headers=HEADERS, params=search_params)
    if not response or not response.json().get("data"):
        print(f"No subtitles found for {imdb_id}")
        return
    
    try:
        file_id = response.json()["data"][0]["attributes"]["files"][0]["file_id"]
    except Exception:
        print(f"file_id missing for {imdb_id}")
        return
    
    # Step 2: Get download link
    download_url = "https://api.opensubtitles.com/api/v1/download"
    download_payload = { "file_id": file_id }
    download_response = safe_request(requests.post, download_url, headers=HEADERS, json=download_payload)
    if not download_response or not download_response.json().get("link"):
        print(f"Download link failed for {imdb_id}")
        return
    
    subtitle_url = download_response.json()["link"]

    # Step 3: Download subtitle file
    subtitle_file = safe_request(requests.get, subtitle_url)
    if not subtitle_file:
        print(f"Failed to download subtitle for {imdb_id}")
        return
    
    filepath = os.path.join(output_folder, f"{imdb_id}.srt")
    with open(filepath, "wb") as f:
        f.write(subtitle_file.content)
    
    print(f"Downloaded subtitle for {imdb_id}")

In [37]:
# --- Main Loop ---
for imdb_id in tqdm(imdb_ids, desc="Downloading subtitles"):
    download_subtitle(imdb_id)
    time.sleep(10)  # Respect API rate limit

Downloading subtitles:   0%|                                                                    | 0/23 [00:00<?, ?it/s]

❌ No subtitles found for tt2554042


Downloading subtitles:   4%|██▌                                                         | 1/23 [00:10<03:56, 10.76s/it]

❌ No subtitles found for tt7260848


Downloading subtitles:   9%|█████▏                                                      | 2/23 [00:21<03:42, 10.58s/it]

❌ No subtitles found for tt6040012


Downloading subtitles:  13%|███████▊                                                    | 3/23 [00:32<03:34, 10.71s/it]

❌ No subtitles found for tt7743400


Downloading subtitles:  17%|██████████▍                                                 | 4/23 [00:42<03:23, 10.71s/it]

❌ No subtitles found for tt3893476


Downloading subtitles:  22%|█████████████                                               | 5/23 [00:53<03:11, 10.62s/it]

❌ No subtitles found for tt2294685


Downloading subtitles:  26%|███████████████▋                                            | 6/23 [01:03<02:59, 10.57s/it]

❌ No subtitles found for tt6862542


Downloading subtitles:  30%|██████████████████▎                                         | 7/23 [01:14<02:48, 10.55s/it]

❌ No subtitles found for tt4853926


Downloading subtitles:  35%|████████████████████▊                                       | 8/23 [01:24<02:37, 10.53s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt2387495


Downloading subtitles:  39%|███████████████████████▍                                    | 9/23 [01:35<02:26, 10.46s/it]

❌ No subtitles found for tt4493550


Downloading subtitles:  43%|█████████████████████████▋                                 | 10/23 [01:45<02:15, 10.45s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt4249442


Downloading subtitles:  48%|████████████████████████████▏                              | 11/23 [01:55<02:04, 10.36s/it]

❌ No subtitles found for tt3021244


Downloading subtitles:  52%|██████████████████████████████▊                            | 12/23 [02:06<01:54, 10.38s/it]

❌ No subtitles found for tt1942905


Downloading subtitles:  57%|█████████████████████████████████▎                         | 13/23 [02:16<01:43, 10.39s/it]

❌ No subtitles found for tt4335698


Downloading subtitles:  61%|███████████████████████████████████▉                       | 14/23 [02:26<01:33, 10.41s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt1353093


Downloading subtitles:  65%|██████████████████████████████████████▍                    | 15/23 [02:37<01:22, 10.36s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt6118134


Downloading subtitles:  70%|█████████████████████████████████████████                  | 16/23 [02:47<01:12, 10.30s/it]

❌ No subtitles found for tt4581032


Downloading subtitles:  74%|███████████████████████████████████████████▌               | 17/23 [02:57<01:02, 10.36s/it]

❌ No subtitles found for tt5933706


Downloading subtitles:  78%|██████████████████████████████████████████████▏            | 18/23 [03:08<00:51, 10.40s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt2404187


Downloading subtitles:  83%|████████████████████████████████████████████████▋          | 19/23 [03:18<00:41, 10.34s/it]

❌ No subtitles found for tt4995402


Downloading subtitles:  87%|███████████████████████████████████████████████████▎       | 20/23 [03:28<00:31, 10.38s/it]

❌ No subtitles found for tt5752374


Downloading subtitles:  91%|█████████████████████████████████████████████████████▊     | 21/23 [03:39<00:20, 10.41s/it]

❌ ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
❌ No subtitles found for tt2150177


Downloading subtitles:  96%|████████████████████████████████████████████████████████▍  | 22/23 [03:49<00:10, 10.33s/it]

❌ No subtitles found for tt4340180


Downloading subtitles: 100%|███████████████████████████████████████████████████████████| 23/23 [04:00<00:00, 10.44s/it]


In [7]:
## GET DESCRIPTIONS FOR MOVIES using Cinemagoer library which scrapes IMDb

import os
import pandas as pd
import imdb
ia = imdb.Cinemagoer()

# Load your CSV file
df = pd.read_csv("../data/sample_100_movies.csv")  # Update path if needed

# Extract the imdb_ids column as a list
imdb_ids = df['imdb_id'].dropna().unique().tolist()

# Define the output folder
output_folder = "../data/descriptions"

# Create the folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Tracking missing entries
missing_count = 0

# Loop through IMDb IDs
for imdb_id in imdb_ids:
    imdb_id_clean = imdb_id.replace("tt", "")  # Cinemagoer uses numeric ID only

    try:
        # Fetch movie data
        movie = ia.get_movie(imdb_id_clean, info=['synopsis', 'plot'])

        # Try to extract synopsis
        synopsis_list = movie.get('synopsis', [])

        if synopsis_list:
            synopsis = synopsis_list[0].split("::")[0].strip()
            file_name = f"{imdb_id}.txt"
        else:
            # Fallback to plot if no synopsis
            plot_list = movie.get('plot', [])
            if plot_list:
                synopsis = plot_list[0].split("::")[0].strip()
                file_name = f"{imdb_id}.txt"
            else:
                # Neither synopsis nor plot found
                synopsis = "No synopsis or plot available."
                file_name = f"FAILED-{imdb_id}.txt"
                missing_count += 1

    except Exception as e:
        # Error fetching data from IMDb
        synopsis = f"Error retrieving synopsis: {e}"
        file_name = f"FAILED-{imdb_id}.txt"
        missing_count += 1

    # Save synopsis or error message to file
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(synopsis)

# Summary
print(f"\n✅ Finished processing all movies.")
print(f"⚠️ Could not retrieve synopsis/plot for {missing_count} movie(s). Files named with 'FAILED-'.")


✅ Finished processing all movies.
⚠️ Could not retrieve synopsis/plot for 0 movie(s). Files named with 'FAILED-'.


In [5]:
pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (pyproject.toml): started
  Building wheel for wikipedia (pyproject.toml): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11785 sha256=af57682b6b78ffcb4816e7b05afbbf233c53df8033ecdf5594930f67fb73ec8e
  Stored in directory: c:\users\aashna unadkat\appdata\local\pip\cache\wheels\79\1d\c8\b64e19423cc5a2a339450ea5d145e7c8eb3d4aa2b150cde33b
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may nee

In [19]:
## GET DESCRIPTIONS FOR REMAINING MOVIES FROM WIKIPEDIA
import os
import pandas as pd
import requests
import re
import time

# Load the full movie dataset
df = pd.read_csv("../data/sample_100_movies.csv")

# Ensure output folder exists
output_folder = "../data/descriptions"
os.makedirs(output_folder, exist_ok=True)

# Plot-related section headers to search for
PLOT_KEYWORDS = ["plot", "synopsis", "plot summary", "plot synopsis"]

# Track success/failure
success_count = 0
failure_count = 0
failed_ids = []

def get_canonical_page_id(url):
    """Extract reliable page ID by resolving the canonical title using redirects."""
    title = url.strip().rsplit('/', 1)[-1]
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'redirects': 1
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    pages = data['query']['pages']
    page_id = next(iter(pages))
    return int(page_id) if page_id != "-1" else None

def get_page_content_by_id(page_id):
    """Fetch full plain-text content of a Wikipedia page by its ID."""
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': True,
        'pageids': page_id
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    return data['query']['pages'][str(page_id)]['extract']

for _, row in df.iterrows():
    imdb_id = row.get("imdb_id")
    wiki_url = row.get("wiki_link")

    if pd.isna(imdb_id) or pd.isna(wiki_url):
        continue

    try:
        # Step 1: Get canonical Page ID
        page_id = get_canonical_page_id(wiki_url)
        if not page_id:
            print(f"⚠️ IMDb ID {imdb_id} — Wikipedia page ID not found.")
            failure_count += 1
            failed_ids.append(imdb_id)
            continue

        # Step 2: Get content using page ID
        content = get_page_content_by_id(page_id)

        # Step 3: Look for plot/synopsis section
        sections = re.split(r'\n==+ *(.+?) *==+\n', content)
        plot_text = None
        for i in range(1, len(sections), 2):
            title = sections[i].strip().lower()
            body = sections[i + 1].strip()
            if any(k in title for k in PLOT_KEYWORDS):
                plot_text = body
                break

        if plot_text:
            file_path = os.path.join(output_folder, f"{imdb_id}.txt")
            mode = "a" if os.path.exists(file_path) else "w"
            with open(file_path, mode, encoding="utf-8") as f:
                f.write("\n\n[Wikipedia Plot Synopsis]\n")
                f.write(plot_text)
            success_count += 1
        else:
            print(f"⚠️ IMDb ID {imdb_id} — Plot section not found.")
            failure_count += 1
            failed_ids.append(imdb_id)

        time.sleep(0.5)  # Be kind to Wikipedia's servers

    except Exception as e:
        print(f"❌ IMDb ID {imdb_id} — Error: {e}")
        failure_count += 1
        failed_ids.append(imdb_id)

# Summary
print(f"\n✅ Wikipedia extraction complete.")
print(f"🟢 Successfully retrieved plot for {success_count} movies.")
print(f"🔴 Failed or missing for {failure_count} movies.")

# Save failed IDs
if failed_ids:
    with open("../data/supplementary files/failed_wikipedia_plots.txt", "w") as f:
        for fid in failed_ids:
            f.write(fid + "\n")

⚠️ IMDb ID tt3893476 — Plot section not found.
⚠️ IMDb ID tt6862542 — Plot section not found.
⚠️ IMDb ID tt5615116 — Plot section not found.
⚠️ IMDb ID tt4744086 — Wikipedia page ID not found.
⚠️ IMDb ID tt7260848 — Plot section not found.
⚠️ IMDb ID tt3017412 — Plot section not found.
⚠️ IMDb ID tt4249442 — Plot section not found.
⚠️ IMDb ID tt4335698 — Plot section not found.
⚠️ IMDb ID tt1353093 — Wikipedia page ID not found.
⚠️ IMDb ID tt8484942 — Wikipedia page ID not found.
⚠️ IMDb ID tt6118134 — Plot section not found.
⚠️ IMDb ID tt4340180 — Plot section not found.

✅ Wikipedia extraction complete.
🟢 Successfully retrieved plot for 89 movies.
🔴 Failed or missing for 12 movies.
