In [None]:
## DIRECTOR INFORMATION
import pandas as pd
import requests
import time

# Load filtered movie dataset
# NOTE(review): `df_movies` is not referenced again in the cells visible here;
# the later cells re-read the same CSV into their own `df` variable.
df_movies = pd.read_csv("../data/sample_100_movies.csv")

In [None]:
# First add director names to dataset using OMDb API and IMDb ID.
import os

# Read the OMDb key from the OMDB_API_KEY environment variable so the real
# credential never has to be saved inside the notebook. The original
# placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "my_omdb_api_key")

# Load your movie CSV (update path as needed)
df = pd.read_csv("../data/sample_100_movies.csv")

# Initialize new columns (empty string = "no director filled in yet")
df["director_1_name"] = ""
df["director_2_name"] = ""

# Function to get director names from OMDb
def get_director_names(imdb_id):
    """Look up a film's directors on OMDb by IMDb ID.

    Args:
        imdb_id: IMDb title identifier, sent as OMDb's ``i`` query parameter.

    Returns:
        list[str]: Director names in the order OMDb lists them, each
        whitespace-trimmed. Empty when OMDb reports ``"N/A"``, when the
        response has no ``"Director"`` field (e.g. an error payload), or when
        the request / JSON decoding fails. Failures are printed, not raised,
        so one bad lookup does not stop the surrounding loop.
    """
    try:
        response = requests.get(
            "https://www.omdbapi.com/",  # https so the API key is not sent in clear text
            params={"i": imdb_id, "apikey": OMDB_API_KEY},  # requests handles URL-encoding
            timeout=10,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()
        data = response.json()
        directors = data.get("Director", "")
        if directors == "N/A":
            return []
        # "".split(",") yields [""], so blank pieces are dropped to make a
        # missing "Director" field come back as [] rather than [""].
        return [d.strip() for d in directors.split(",") if d.strip()]
    except Exception as e:
        print(f"❌ Error fetching director for {imdb_id}: {e}")
        return []

# Fetch the directors of every movie and store up to two of them per row
name_columns = ("director_1_name", "director_2_name")

for row_label, movie in df.iterrows():
    fetched_names = get_director_names(movie["imdb_id"])

    # zip stops at the shorter input: nothing is written when the lookup came
    # back empty, and any third or later director is ignored.
    for column, director in zip(name_columns, fetched_names):
        df.at[row_label, column] = director

    time.sleep(1)  # Be polite to the API

# Overwrite the input CSV with the enriched table
df.to_csv("../data/sample_100_movies.csv", index=False)
print("Director names added and saved.")

In [None]:
## Try to infer gender from Wikipedia page of the director.

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

def get_gender_from_wikipedia(name):
    """Guess a person's gender from pronouns in their English Wikipedia article.

    Searches Wikipedia for ``name``, opens the top search hit and compares the
    number of masculine pronouns (he / his / him) with the number of feminine
    pronouns (she / her / hers) in the first non-empty paragraph.

    Args:
        name: Search text passed to the Wikipedia search API.

    Returns:
        str: ``"Male"`` or ``"Female"`` when one pronoun family outnumbers the
        other; ``"Unknown"`` when there is no search hit, no non-empty
        paragraph, a tie, or any request / parsing error.
    """
    # Imported locally because this cell does not import them at the top.
    import re
    from urllib.parse import quote

    try:
        # Wikipedia search
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": name,
            "format": "json"
        }
        res = requests.get(search_url, params=params, timeout=10)
        res.raise_for_status()
        results = res.json().get("query", {}).get("search", [])
        if not results:
            return "Unknown"

        # Get Wikipedia page; quote() keeps titles containing "?", "#" or "%"
        # from being misread as a query string / fragment.
        page_title = results[0]["title"]
        page_url = "https://en.wikipedia.org/wiki/" + quote(page_title.replace(" ", "_"))
        page = requests.get(page_url, timeout=10)
        page.raise_for_status()  # do not count pronouns on an error page
        soup = BeautifulSoup(page.text, "html.parser")

        # First non-empty paragraph
        for p in soup.find_all("p"):
            if p.text.strip():
                first_para = p.text.lower()
                break
        else:
            return "Unknown"

        # Count pronouns on word boundaries so that a pronoun at the very start
        # of the paragraph ("He is ...") or next to punctuation ("... hers.")
        # is counted too; plain " he " substring matching misses both.
        male_count = len(re.findall(r"\b(?:he|his|him)\b", first_para))
        female_count = len(re.findall(r"\b(?:she|her|hers)\b", first_para))

        if male_count > female_count:
            return "Male"
        elif female_count > male_count:
            return "Female"
        else:
            return "Unknown"

    except Exception:
        # Best-effort lookup: any ordinary failure maps to "Unknown".
        # `Exception` (not a bare except) lets KeyboardInterrupt / kernel
        # interrupts propagate, so the long scraping loop can still be stopped.
        return "Unknown"

# Load your data
df = pd.read_csv("../data/sample_100_movies.csv")  # change to your file path

# Each director name column and the gender column that receives its result
gender_for_name = {
    "director_1_name": "director_1_gender",
    "director_2_name": "director_2_gender",
}
for gender_col in gender_for_name.values():
    df[gender_col] = ""

# Look up every director that has a name; missing names keep the "" default
for row_label, movie in df.iterrows():
    for name_col, gender_col in gender_for_name.items():
        director = movie.get(name_col)
        if pd.notna(director):
            df.at[row_label, gender_col] = get_gender_from_wikipedia(director)
            time.sleep(1.5)  # pause after each lookup

# Count "Unknown" entries across both gender columns
unknown_count = sum(
    (df[gender_col] == "Unknown").sum() for gender_col in gender_for_name.values()
)

# Save to file
df.to_csv("../data/supplementary files/sample_100_movies_wikipedia_gender.csv", index=False)

# Final output
print(f"Gender inference complete. {unknown_count} director(s) could not be classified.")

In [None]:
## Try to infer gender from dictionary of Indian names
## Dictionary found on: https://www.kaggle.com/datasets/shubhamuttam/indian-names-by-gender

import pandas as pd
import os

# Read the Wikipedia-annotated movies and the Kaggle dictionary of Indian names.
movies_df = pd.read_csv("../data/supplementary files/sample_100_movies_wikipedia_gender.csv")
name_gender_df = pd.read_csv("../data/supplementary files/Gender_Data.csv")

# Lower-case the dictionary names so lookups can be case-insensitive
name_gender_df = name_gender_df.assign(Name=name_gender_df['Name'].str.lower())

# Gender code → label (codes that are not listed are kept as they are)
gender_map = {0: 'Male', 1: 'Female'}

# Dictionary: first name → set of every gender label recorded for that name
name_to_gender = {
    first_name: {gender_map.get(code, code) for code in codes}
    for first_name, codes in name_gender_df.groupby('Name')['Gender']
}

# Core gender inference function
def infer_gender(name):
    """Infer a gender label from the first token of a name via `name_to_gender`.

    Args:
        name: Full name; only the first whitespace-separated token is looked
            up, lower-cased.

    Returns:
        str: ``'Unknown'`` when `name` is missing, not a string, blank, or its
        first token is not in the dictionary; the single recorded label when
        the dictionary lists exactly one gender for the token; ``'Ambiguous'``
        when it lists more than one.
    """
    if pd.isna(name) or not isinstance(name, str):
        return 'Unknown'
    tokens = name.split()
    if not tokens:
        # Empty / whitespace-only names have no first token; indexing [0] on
        # the empty split result would raise IndexError.
        return 'Unknown'
    genders = name_to_gender.get(tokens[0].lower())
    if genders is None:
        return 'Unknown'
    if len(genders) == 1:
        return next(iter(genders))
    return 'Ambiguous'

# Wrapper for director_2: return "" if name is missing
def infer_gender_optional(name):
    """Like `infer_gender`, but yields "" instead of a label when no name is given.

    A name counts as given only if it is not NA, is a string, and is not blank
    after stripping whitespace.
    """
    has_name = not pd.isna(name) and isinstance(name, str) and name.strip() != ""
    return infer_gender(name) if has_name else ""

# Run the dictionary lookup for both directors. The second director uses the
# optional variant so that a missing name stays "" instead of 'Unknown'.
gender_inferrers = {"director_1": infer_gender, "director_2": infer_gender_optional}
for prefix, inferrer in gender_inferrers.items():
    movies_df[f"{prefix}_gender"] = movies_df[f"{prefix}_name"].map(inferrer)

# Write the result next to the other supplementary files
output_folder = "../data/supplementary files/"
output_name = "sample_100_movies_with_dictionary_gender.csv"
output_path = os.path.join(output_folder, output_name)
movies_df.to_csv(output_path, index=False)

print(f"File saved to: {output_path}")

In [None]:
## Upon checking both, it seemed like Wikipedia was able to do an okay job, and the dictionary just needed to fill in some gaps.
## At the end of this merge, 13 records remained with an Unknown gender between both director columns.

import pandas as pd

# Load both files
# The Wikipedia cell saves its results as "sample_100_movies_wikipedia_gender.csv"
# (no "with_" in the name), so that exact file name is read here; the previous
# "..._with_wikipedia_gender.csv" path is never written by this notebook.
original_df = pd.read_csv("../data/supplementary files/sample_100_movies_wikipedia_gender.csv")
new_df = pd.read_csv("../data/supplementary files/sample_100_movies_with_dictionary_gender.csv")

# Index both frames by imdb_id so values are aligned per movie, not by row position
original_df.set_index("imdb_id", inplace=True)
new_df.set_index("imdb_id", inplace=True)

# List of gender columns to update
gender_cols = ["director_1_gender", "director_2_gender"]

for col in gender_cols:
    original_values = original_df[col]
    new_values = new_df[col]

    # Take the dictionary value only where Wikipedia said "Unknown" and the
    # dictionary produced a definite "Male" / "Female"; keep everything else.
    fill_from_dictionary = (original_values == "Unknown") & new_values.isin(["Male", "Female"])
    original_df[col] = original_values.mask(fill_from_dictionary, new_values)

# Reset index and save
original_df.reset_index(inplace=True)
original_df.to_csv("../data/sample_100_movies.csv", index=False)

print("Gender columns updated in 'sample_100_movies.csv'")

In [None]:
## GET BOX OFFICE INFORMATION FROM WIKIPEDIA, IF AVAILABLE
import os
import pandas as pd
import requests
import re
import time

# Load movie data
df = pd.read_csv("../data/sample_100_movies.csv")

# Prepare output folder
# The scraped results are written under "../data/supplementary files/", so that
# is the folder that must exist. (Creating "data" relative to the notebook's
# working directory only produced a stray empty folder.)
os.makedirs("../data/supplementary files", exist_ok=True)

# One dict per processed movie; turned into a DataFrame once the loop is done
output_rows = []

# Canonical Wikipedia page ID resolver
def get_canonical_page_id_and_url(wiki_url):
    """Resolve a Wikipedia article link to its page ID and canonical URL.

    The title is taken from the link, sent to the MediaWiki query API with
    redirects enabled, and the title the API reports is used to build the
    returned URL.

    Args:
        wiki_url: Link to an English Wikipedia article (or a bare title).

    Returns:
        tuple: ``(page_id, permalink)`` with an ``int`` page ID and a ``str``
        URL, or ``(None, None)`` when no title can be extracted, the API
        response carries no pages, or the page is reported under ID ``"-1"``.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error
            status (the calling loop reports these per movie).
    """
    # Imported locally because this cell does not import urllib at the top.
    from urllib.parse import unquote, urlsplit

    # urlsplit drops any "#fragment" / "?query" part of the link.
    path = urlsplit(wiki_url.strip()).path
    if "/wiki/" in path:
        # Keep everything after "/wiki/" so titles that contain "/" survive.
        raw_title = path.split("/wiki/", 1)[1]
    else:
        raw_title = path.rsplit("/", 1)[-1]

    # Links are percent-encoded ("%27" for an apostrophe, UTF-8 escapes, ...);
    # the API expects the decoded title and requests re-encodes it itself.
    title = unquote(raw_title)
    if not title:
        return None, None

    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "redirects": 1
    }
    r = requests.get(api_url, params=params, timeout=10)
    r.raise_for_status()
    pages = r.json().get("query", {}).get("pages", {})
    if not pages:
        return None, None

    page_id = next(iter(pages))
    if page_id == "-1":
        return None, None

    canonical_title = pages[page_id]["title"].replace(" ", "_")
    # Escape the two characters that would otherwise change the meaning of the
    # URL ("%" starts an escape, "?" starts a query string); the rest of the
    # title is left readable as before.
    safe_title = canonical_title.replace("%", "%25").replace("?", "%3F")
    permalink = f"https://en.wikipedia.org/wiki/{safe_title}"
    return int(page_id), permalink

# Extract box office from HTML using regex
def extract_box_office_from_html(html):
    """Pull the "Box office" value out of a page's infobox table.

    Args:
        html: Raw HTML of the page.

    Returns:
        The text of the cell that follows the "Box office" header inside the
        first ``<table class="infobox...">``, with HTML tags and bracketed
        spans such as ``[1]`` removed and surrounding whitespace stripped.
        ``None`` when there is no such table or no such row.
    """
    # Only look inside the infobox so other tables on the page cannot match
    infobox = re.search(r'(<table class="infobox[^>]*>.*?</table>)', html, re.DOTALL)
    if infobox is None:
        return None

    # Header cell "Box office" (any letter case) followed by its value cell
    row = re.search(
        r'<th[^>]*>\s*Box office\s*</th>\s*<td[^>]*>(.*?)</td>',
        infobox.group(1),
        re.DOTALL | re.IGNORECASE,
    )
    if row is None:
        return None

    # Strip markup and reference markers from the cell contents
    return re.sub(r'<.*?>|\[.*?\]', '', row.group(1)).strip()

# Running tallies of movies with / without a usable box office value
success = failure = 0

for _, movie in df.iterrows():
    imdb_id, wiki_url = movie.get("imdb_id"), movie.get("wiki_link")

    # Rows lacking an ID or a Wikipedia link are skipped and not counted
    if pd.isna(imdb_id) or pd.isna(wiki_url):
        continue

    try:
        # Step 1: Resolve to canonical page
        page_id, permalink = get_canonical_page_id_and_url(wiki_url)
        if not page_id:
            print(f"IMDb ID {imdb_id}: Page not found")
            failure += 1
            continue

        # Step 2: Fetch HTML content
        html = requests.get(permalink).text

        # Step 3: Extract Box Office info (None when the infobox has no such row)
        box_office = extract_box_office_from_html(html)

        # The row is recorded even when no value was found
        output_rows.append(dict(imdb_id=imdb_id, wiki_url=permalink, box_office=box_office))

        if not box_office:
            print(f"{imdb_id} — Box office not found")
            failure += 1
        else:
            print(f"{imdb_id} — {box_office}")
            success += 1

        time.sleep(0.5)

    except Exception as err:
        # Any error for one movie is reported and counted; the loop carries on
        print(f"{imdb_id}: {err}")
        failure += 1

# Save output
output_df = pd.DataFrame(output_rows)
output_df.to_csv("../data/supplementary files/box_office_from_wikipedia.csv", index=False)
print(f"\nDone! Success: {success}, Failures: {failure}")
print("Saved: box_office_from_wikipedia.csv")


In [None]:
## CLEAN UP BOX OFFICE INFORMATION

import pandas as pd
import re
import html

# Load the raw box office strings saved by the Wikipedia scraping cell.
# Each scraped row was written with the keys imdb_id, wiki_url and box_office;
# box_office is empty where no value was found on the page.
df = pd.read_csv("../data/supplementary files/box_office_from_wikipedia.csv")  # or replace with your DataFrame

def normalize_box_office(value):
    """Convert a scraped rupee box office string to a number of crore.

    Only the first rupee amount (introduced by "₹") is used; anything from the
    first parenthesised remark or "equivalent to" onwards is ignored.

    Args:
        value: Raw box office text, e.g. "est. ₹1.5 billion".

    Returns:
        float | None: The amount in crore rounded to two decimals, or ``None``
        when `value` is not a non-blank string or contains no "₹" amount.

    NOTE(review): the word "crore" is stripped before matching, so an amount
    with no unit is treated as crore. A plain rupee figure with no unit would
    therefore be read as that many crore - confirm against the data.
    """
    if not isinstance(value, str) or not value.strip():
        return None

    # Unescape HTML entities and lowercase
    text = html.unescape(value.lower())

    # Remove text in parentheses and after "equivalent to"
    text = re.split(r'\(.*?\)|equivalent to', text)[0]

    # Remove non-breaking-space leftovers, thousands separators and labels
    text = text.replace("&nbsp;", " ").replace("nbsp;", "").replace(",", "")
    text = re.sub(r"(est\.?|crores?)", "", text)

    # Extract amount + unit. The number must be a well-formed decimal
    # ("12", "12.5", ".5"): a loose character class such as [\d.]+ would also
    # capture tokens like "1.2.3" or "...", which float() rejects with a
    # ValueError that would abort the whole column conversion. A single
    # trailing "." (end of sentence) is allowed before the unit.
    match = re.search(
        r'₹\s?(\d+(?:\.\d+)?|\.\d+)\.?\s*(crore|million|billion|lakh)?', text
    )
    if not match:
        return None

    amount = float(match.group(1))
    unit = match.group(2)

    # Convert to crore
    if unit == "million":
        amount = amount * 0.1  # 1 million = 0.1 crore
    elif unit == "billion":
        amount = amount * 100  # 1 billion = 100 crore
    elif unit == "lakh":
        amount = amount * 0.01  # 1 lakh = 0.01 crore
    # if unit is already crore or missing, leave as is

    return round(amount, 2)

# Normalise every raw value; entries that cannot be parsed come back empty
cleaned_amounts = df["box_office"].map(normalize_box_office)
df["box_office_cleaned_inr_crore"] = cleaned_amounts

# Write the raw and the cleaned column side by side
df.to_csv("../data/supplementary files/box_office_cleaned.csv", index=False)
print("Cleaned values saved to: box_office_cleaned.csv")

In [None]:
## ADD CLEANED UP BOX OFFICE DATA TO SAMPLE 100 MOVIES FILE
import pandas as pd
import os

# Define paths
data_folder = os.path.join("..", "data")
sample_path = os.path.join(data_folder, "sample_100_movies.csv")
box_office_path = os.path.join(data_folder, "supplementary files/box_office_cleaned.csv")

# Load the datasets
sample_df = pd.read_csv(sample_path)
box_office_df = pd.read_csv(box_office_path)

# Determine merge key: imdb_id when both files have it, otherwise title
merge_key = "imdb_id" if "imdb_id" in sample_df.columns and "imdb_id" in box_office_df.columns else "title"

box_office_col = "box_office_cleaned_inr_crore"

# sample_100_movies.csv is overwritten below, so on a re-run it already
# contains this column. Drop the stale copy first; otherwise the merge would
# produce "<col>_x" / "<col>_y" instead of refreshing the value.
sample_df = sample_df.drop(columns=[box_office_col], errors="ignore")

# Keep one box office value per key so the left merge cannot multiply movie rows
box_office_lookup = (
    box_office_df[[merge_key, box_office_col]]
    .drop_duplicates(subset=merge_key, keep="first")
)

# Left merge: every movie is kept; movies without a scraped value get NaN
sample_df = sample_df.merge(box_office_lookup, on=merge_key, how="left")

# Save in sample_100_movies.csv (overwrites the file that was read above)
sample_df.to_csv(sample_path, index=False)