In [1]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

In [2]:
# Read the movie list from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer="result.csv")
df.head(n=5)

Unnamed: 0,year,rating,imdbid,id,title
0,1874,0,3155794,9602,Passage de Venus
1,1877,0,14495706,9804,La Rosace Magique
2,1878,0,2221420,9603,Sallie Gardner at a Gallop
3,1878,0,12592084,9806,Le singe musicien
4,1881,0,7816420,9816,Athlete Swinging a Pick


In [3]:
# format imdb IDs
def format_imdb_id(x):
    """Return ``x`` as an IMDb title id: "tt" plus the integer zero-padded to 7 digits."""
    # The 07d format pads to at least seven digits; longer ids pass through unchanged.
    return f"tt{int(x):07d}"
# Derive the "tt"-prefixed id for every row, then preview it next to the raw id.
df["imdb_tt"] = df["imdbid"].map(format_imdb_id)
df.loc[:, ["imdbid", "imdb_tt"]].head()

Unnamed: 0,imdbid,imdb_tt
0,3155794,tt3155794
1,14495706,tt14495706
2,2221420,tt2221420
3,12592084,tt12592084
4,7816420,tt7816420


In [4]:
import json
import requests
from bs4 import BeautifulSoup

#scraping function
def _extract_movie_fields(data):
    """Return ``(genre, description, rating)`` from one parsed JSON-LD "Movie" dict.

    Missing or null fields come back as empty strings. A list-valued genre is
    joined with ", ".
    """
    genre = data.get("genre") or ""
    if isinstance(genre, list):
        genre = ", ".join(str(g) for g in genre)

    description = data.get("description") or ""

    # aggregateRating is only usable when it is an object; anything else means "no rating".
    aggregate = data.get("aggregateRating")
    rating = aggregate.get("ratingValue", "") if isinstance(aggregate, dict) else ""

    return genre, description, rating


def scrape_imdb(imdb_tt):
    """Scrape genre, description and rating for one IMDb title page.

    Downloads ``https://www.imdb.com/title/<imdb_tt>/`` and reads the page's
    ``application/ld+json`` script blocks, returning the fields of the first
    object whose ``@type`` is ``"Movie"``.

    Parameters
    ----------
    imdb_tt : str
        IMDb title id including the "tt" prefix, e.g. ``"tt3155794"``.

    Returns
    -------
    tuple
        ``(genre, description, rating)``. All three are ``""`` when the request
        fails, the server answers with an HTTP error status, or no Movie
        object is found on the page.

    Notes
    -----
    Only network/HTTP errors and unparsable JSON blocks are treated as
    "no data". Other exceptions (for example a missing HTML parser) propagate,
    so a broken environment fails fast instead of silently producing empty rows.
    """
    url = f"https://www.imdb.com/title/{imdb_tt}/"

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }
    not_found = ("", "", "")

    try:
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: an unreachable or rejected page yields an empty row.
        return not_found

    soup = BeautifulSoup(r.text, "lxml")

    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Empty script tag: nothing to parse, try the next one.
            continue

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # One malformed block must not hide a valid one later on the page.
            continue

        # JSON-LD may be a single object or a list of objects.
        candidates = data if isinstance(data, list) else [data]
        for item in candidates:
            if isinstance(item, dict) and item.get("@type") == "Movie":
                return _extract_movie_fields(item)

    return not_found

In [5]:
# Scrape every title in the first 10664 rows, collecting one entry per row in each list.
genres, descriptions, imdb_ratings = [], [], []
for imdb_tt in tqdm(df["imdb_tt"][:10664]):
    scraped_genre, scraped_description, scraped_rating = scrape_imdb(imdb_tt)
    genres.append(scraped_genre)
    descriptions.append(scraped_description)
    imdb_ratings.append(scraped_rating)
    # Pause one second between requests.
    time.sleep(1)

100%|███████████████████████████████████| 10664/10664 [9:51:06<00:00,  3.33s/it]


In [6]:
# save scraped data back to dataframe
# The three lists are filled in lockstep by the scraping loop; verify that before writing.
assert len(genres) == len(descriptions) == len(imdb_ratings), "scraped lists differ in length"

# Target exactly the rows that were scraped (the first len(genres) rows) instead of
# a hard-coded end label, so this cell stays correct if the scraped range changes
# or the frame does not use the default 0..n-1 index.
scraped_rows = df.index[:len(genres)]
df.loc[scraped_rows, "genre"] = genres
df.loc[scraped_rows, "description"] = descriptions
df.loc[scraped_rows, "imdb_rating"] = imdb_ratings

# create Bechdel pass column: "Yes" when rating equals 3, otherwise "No"
df["bechdel_pass"] = df["rating"].eq(3).map({True: "Yes", False: "No"})

In [7]:
#export to new CSV file
# Keep the path in one variable so the confirmation message always names the
# file that was actually written (the old message reported a different spelling).
output_path = "beckdel_allinfo_final.csv"
df.to_csv(output_path, index=False)
print(f"Saved as {output_path}")

Saved as bechdel_allinfo_final.csv
