In [1]:
import os
import re

import pandas as pd
import requests
from tqdm.notebook import tqdm

In [2]:
# Load the script reference table, keep only the rows whose script was
# downloaded, and renumber the remaining rows from 0.
df = (
    pd.read_csv("script_ref.csv", index_col=0)
    .loc[lambda frame: frame["script_downloaded"] == "Yes"]
    .reset_index(drop=True)
)

In [3]:
# Column names of the table that collects the fields fetched from OMDb.
OMDB_columns = [
    "OMDB_Title",
    "Rating",
    "Release",
    "Runtime",
    "Genre",
    "Director",
    "Writer",
    "Language",
    "Country",
    "Noms",
    "IMDB",
    "IMDB_Votes",
    "Rotten_Tomatoes",
    "Metascore",
    "Boxoffice",
]

In [4]:
# Empty results table: one row per script title, one column per OMDb field.
OMDB = pd.DataFrame(columns=OMDB_columns, index=df["title"])

In [21]:
# First row to query. The hard-coded 950 presumably resumes an earlier partial
# run (TODO confirm); set it to 0 to fetch every title from scratch.
START_ROW = 950

# OMDb endpoint, queried over HTTPS so the API key is not sent in clear text.
OMDB_URL = "https://www.omdbapi.com/"

# The key is read from the environment so that it is never stored in the
# notebook. Export OMDB_API_KEY before running this cell.
OMDB_API_KEY = os.environ["OMDB_API_KEY"]


def _query_title(title):
    """Return the title to send to OMDb.

    If the title contains ", The", the text before it is taken, reduced to
    whatever follows its last ", ", and prefixed with "The ". Any other title
    is returned unchanged.
    """
    if ", The" in title:
        parts = title.split(", The")
        return f"The {parts[0].rsplit(', ', 1)[-1]}"
    return title


def _rotten_tomatoes(ratings):
    """Return the Rotten Tomatoes value from an OMDb ``Ratings`` list, or None.

    The entry is looked up by its ``Source`` instead of by position, because
    the second entry of the list is not guaranteed to be Rotten Tomatoes.
    """
    for entry in ratings:
        if entry.get("Source") == "Rotten Tomatoes":
            return entry.get("Value")
    return None


for i in tqdm(range(START_ROW, len(OMDB)), desc="Processing"):
    title = _query_title(df["title"][i])

    # Let requests build and URL-encode the query string (spaces, '&', '#', ...).
    response = requests.get(
        OMDB_URL,
        params={"t": title, "apikey": OMDB_API_KEY},
        timeout=30,
    )
    data = response.json()

    # OMDb answers {"Response": "False", ...} when the lookup fails;
    # such rows are left empty.
    if data.get("Response") != "True":
        continue

    # Fields missing from the payload are stored as None.
    record = {
        "OMDB_Title": data.get("Title"),
        "Rating": data.get("Rated"),
        "Release": data.get("Released"),
        "Runtime": data.get("Runtime"),
        "Genre": data.get("Genre"),
        "Director": data.get("Director"),
        "Writer": data.get("Writer"),
        "Language": data.get("Language"),
        "Country": data.get("Country"),
        "Noms": data.get("Awards"),
        "IMDB": data.get("imdbRating"),
        "IMDB_Votes": data.get("imdbVotes"),
        "Rotten_Tomatoes": _rotten_tomatoes(data.get("Ratings", [])),
        "Metascore": data.get("Metascore"),
        "Boxoffice": data.get("BoxOffice"),
    }

    # Write by position in a single indexing step. Chained assignment
    # (OMDB[col][i] = value) relies on a deprecated positional fallback on the
    # title index and does not update the frame under Copy-on-Write.
    for column, value in record.items():
        OMDB.iloc[i, OMDB.columns.get_loc(column)] = value

Processing:   0%|          | 0/196 [00:00<?, ?it/s]

In [24]:
# Save the raw lookup results; the frame's index is written as the first column.
OMDB.to_csv("OMDB_raw.csv", index=True)

Ensure We Have the Right Movie

In [108]:
# Reload the saved OMDb results and the script reference table from disk.
OMDB = pd.read_csv("OMDB_raw.csv")
scripts = pd.read_csv("script_ref.csv", index_col=0)

In [109]:
# Keep the downloaded scripts only, and just the two columns needed to
# compare writer credits.
is_downloaded = scripts["script_downloaded"] == "Yes"
scripts = scripts.loc[is_downloaded, ["title", "writer"]]

In [110]:
# Keep the rows for which OMDb returned a title, and just the two columns
# needed to compare writer credits.
has_omdb_title = OMDB["OMDB_Title"].notna()
OMDB = OMDB.loc[has_omdb_title, ["title", "Writer"]]

In [111]:
# Inner-join the scraped writer credit with the OMDb writer credit on title.
write = pd.merge(scripts, OMDB, on="title")

In [112]:
# Give the two writer columns unambiguous names. They are renamed through an
# explicit mapping: assigning a *set* of names to `.columns` is unsafe because
# a set has no defined order, so the labels could land on the wrong columns.
write = write.rename(columns={"writer": "scrape_writer", "Writer": "OMDB_writer"})

# Scraped credit: drop the "Written by " prefix and turn commas into spaces.
write["scrape_writer"] = (
    write["scrape_writer"]
    .str.replace("Written by ", "", regex=False)
    .str.replace(",", " ", regex=False)
)

# OMDb credit: remove the commas.
write["OMDB_writer"] = write["OMDB_writer"].str.replace(",", "", regex=False)

In [113]:
def count_common_words(row):
    """Count the distinct words shared by the two writer credits of a row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the 'scrape_writer' and 'OMDB_writer' entries.

    Returns
    -------
    int
        Number of whitespace-separated words that occur in both credits.
        A missing credit (None / NaN) contributes no words, so it can never
        produce a match.
    """
    def words(credit):
        # str(NaN) is "nan": without this guard two missing credits would
        # "share" the word "nan" and be counted as a match.
        if pd.isna(credit):
            return set()
        return set(str(credit).split())

    return len(words(row['scrape_writer']) & words(row['OMDB_writer']))

# Add, for every row, the number of words its two writer credits have in common.
write = write.assign(common_words_count=write.apply(count_common_words, axis=1))

In [114]:
# Retain only the rows whose two writer credits share at least one word.
keep = write.loc[write["common_words_count"].gt(0)]

In [115]:
# Titles removed by hand even though they passed the word-overlap check
# (presumably OMDb returned a different film for them; TODO confirm).
excluded_titles = [
    "All the King's Men",
    "Clash of the Titans",
    "Hard to Kill",
    "Program, The",
    "Star Wars: The Force Awakens",
]
keep = keep.loc[~keep['title'].isin(excluded_titles)]

In [118]:
# Reload the full OMDb table and keep the rows whose title was retained.
OMDB = pd.read_csv("OMDB_raw.csv")
is_kept = OMDB["title"].isin(keep['title'])
movie_data = OMDB.loc[is_kept].reset_index(drop=True)

Cleaning Up Movie Data

In [119]:
# Parse the release date once, derive the calendar features from it, then
# drop the original column.
release_date = pd.to_datetime(movie_data['Release'], format='%d %b %Y')
release_month = release_date.dt.month

movie_data = movie_data.assign(
    Day_of_Year=release_date.dt.dayofyear,
    Week_of_Year=release_date.dt.isocalendar().week,
    # Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    Season=(release_month % 12 + 3) // 3,
    Month=release_month,
    Year=release_date.dt.year,
).drop(columns="Release")

In [120]:
# Keep the first run of digits of the Runtime string as an integer; rows
# without any digits (or without a value) become 0.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence. expand=False returns a Series instead of a one-column DataFrame.
movie_data['Runtime'] = (
    movie_data['Runtime']
    .str.extract(r'(\d+)', expand=False)
    .fillna(0)
    .astype(int)
)

In [121]:
# Feature engineering: total wins and total nominations parsed from the awards text.
# Raw-string patterns avoid the invalid '\d' escape; expand=False returns a
# Series (there is a single capture group) instead of a one-column DataFrame.
movie_data['Total_Wins'] = (
    movie_data['Noms'].str.extract(r'(\d+) wins?', expand=False).astype(float)
)
movie_data['Total_Noms'] = (
    movie_data['Noms'].str.extract(r'(\d+) nominations?', expand=False).astype(float)
)

# Cleaning the rating values of the different websites.
# regex=False is passed explicitly: every replacement below is a literal one,
# and '$' in particular would otherwise depend on the version-specific default.
movie_data['IMDB'] = movie_data['IMDB'].astype(float)
movie_data['IMDB_Votes'] = (
    movie_data['IMDB_Votes'].str.replace(',', '', regex=False).astype(float)
)

# Only values containing '%' are parsed as a Rotten Tomatoes score; anything
# else (missing, or a rating in another format) becomes NaN.
# NOTE(review): the previous code first assigned the integer 0 to those rows,
# but the following .str.replace turned every non-string 0 back into NaN, so
# the outcome was NaN as well. Append .fillna(0) if zero is really wanted.
rt_raw = movie_data['Rotten_Tomatoes']
rt_is_percent = rt_raw.str.contains('%', na=False, regex=False)
movie_data['Rotten_Tomatoes'] = (
    rt_raw.where(rt_is_percent).str.replace('%', '', regex=False).astype(float)
)

movie_data['Metascore'] = movie_data['Metascore'].astype(float)
movie_data['Boxoffice'] = (
    movie_data['Boxoffice']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

movie_data = movie_data.drop(columns="Noms")

  movie_data['Boxoffice'] = movie_data['Boxoffice'].str.replace(',', '').str.replace('$', '').astype(float)


In [122]:
# Save the cleaned table before the categorical columns are expanded.
movie_data.to_csv("movie_data_no_dummies.csv", index=True)

In [123]:
def add_multilabel_dummies(frame, column):
    """Replace a ', '-separated label column by one count column per label.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding `column`. Its index must be unique, because the
        per-label counts are aligned back to the rows through the index.
    column : str
        Name of the column to expand. Missing values are treated as the single
        label 'Unknown'.

    Returns
    -------
    pandas.DataFrame
        A new frame without `column`, with one '<column>_<label>' column per
        distinct label holding how many times the label occurs in the row.
        Fully duplicated rows are dropped.
    """
    # fillna returns a new Series. Calling frame[column].fillna(..., inplace=True)
    # mutates a temporary and leaves the frame untouched under Copy-on-Write.
    labels = frame[column].fillna('Unknown').str.split(', ').explode()

    # One-hot encode each exploded label, then add the rows that came from the
    # same original row (explode repeats the index value of that row).
    indicators = pd.get_dummies(labels, prefix=column).groupby(level=0).sum()

    return frame.drop(columns=[column]).join(indicators).drop_duplicates()


# Making dummy variables for parent rating, genres, directors, writers,
# languages and countries, in that order.
for multilabel_column in ['Rating', 'Genre', 'Director', 'Writer', 'Language', 'Country']:
    movie_data = add_multilabel_dummies(movie_data, multilabel_column)

In [124]:
# Save the table with the categorical columns expanded into indicator columns.
movie_data.to_csv("movie_data_dummies.csv", index=True)