# 0. Import Data and Dependencies

## 0.0 Dependencies

In [1]:
import pandas as pd
import numpy as np
import os
import gdown
import zipfile
import shutil
import re

## 0.1 Extracting and Downloading Data

Paths

In [2]:
gdrive_url = 'https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2'
output_path = 'archive.zip'
output_folder_path = 'archive'
extract_to_folder = "c:\\temp_extract"

Extraction Functions

In [6]:
# function for extracting a zip file stored at a url

def extract_zip(zip_file_path, extract_to):
    # Extract the zip file if it exists and is valid 
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            print(f"Extracted all files to {extract_to}")
    except FileNotFoundError:
        print(f"Error: File {zip_file_path} does not exist.")
    except zipfile.BadZipFile:
        print("Error: Not a valid zip file.")
    except Exception as e:
        print(f"An error occured: {e}")
    

# function for downloading and extracting zip folder from gdrive link
def download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path):
    # Download the file
    try: 
        gdown.download(gdrive_url, output_path, quiet=False)
        print(f'File downloaded and saved as {output_path}')
        # Extract the downloaded zip file
        extract_zip(output_path, extract_to_folder)
        # Move extracted folder and clean up
        shutil.move(extract_to_folder, output_folder_path)
        os.remove(output_path)
        print(f"Moved extracted files to {output_folder_path} and removed {output_path}")
    except Exception as e:
        print(f"An error occurred during the download or extraction process: {e}")

download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path)

Downloading...
From (original): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2
From (redirected): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2&confirm=t&uuid=4102c6f3-c50c-4a77-933a-98500c6488a8
To: c:\Users\bened\DataScience\ANLP\AT2\36118_NLP_Spring\notebooks\archive.zip


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

File downloaded and saved as archive.zip


  3%|▎         | 18.4M/689M [1:17:37<47:18:35, 3.94kB/s]
 30%|██▉       | 206M/689M [1:17:13<3:01:08, 44.5kB/s]


Extracted all files to c:\temp_extract
Moved extracted files to archive and removed archive.zip


In [8]:
# Define path for screenplay data
texts_path = os.path.join(output_folder_path, "screenplay_data", "data", "raw_texts", "raw_texts")

# Intialize a dictionary for storing file names and contents
screenplays = {}

# List and iterate over all files in the folder
for file_name in os.listdir(texts_path):
    file_path = os.path.join(texts_path, file_name)

    # Ensure the path is an actual file before reading
    if os.path.isfile(file_path):
        try:
            # Read and store file contents
            with open(file_path, 'r', encoding='utf-8') as f:
                screenplays[file_name] = f.read()
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Print a sample of first ten files to check files have been parsed
for i, (file_name, content) in enumerate(screenplays.items()):
    if i == 10:
        break
    print(f"Example of {file_name}:\n")
    print(content[:100])
    print("-" * 50)

Error reading A Hard Days Night_0581729.txt: 'utf-8' codec can't decode byte 0xe9 in position 62670: invalid continuation byte
Error reading An American Werewolf in Paris_0118604.txt: 'utf-8' codec can't decode byte 0xe9 in position 47384: invalid continuation byte
Error reading At First Sight_0132512.txt: 'utf-8' codec can't decode byte 0x85 in position 2467: invalid start byte
Error reading Batman Robin_0118688.txt: 'utf-8' codec can't decode byte 0x92 in position 5242: invalid start byte
Error reading Body Bags_0106449.txt: 'utf-8' codec can't decode byte 0xa9 in position 45: invalid start byte
Error reading Boy Who Never Slept_1781782.txt: 'utf-8' codec can't decode byte 0x85 in position 37957: invalid start byte
Error reading Brazil_0088846.txt: 'utf-8' codec can't decode byte 0xa3 in position 30758: invalid start byte
Error reading Erik the Viking_0097289.txt: 'utf-8' codec can't decode byte 0xe9 in position 157568: invalid continuation byte
Error reading Exorcist The Beginning_0

Download movie metadata and read into pandas DataFrame.

In [6]:
# Set display option for visibility
pd.set_option('display.max_columns', 25)

# Define path to the metadata CSV file
csv_path = os.path.join(output_folder_path, 'movie_metadata', 'movie_meta_data.csv')

# Ensure path exists before trying to read 
if os.path.exists(csv_path):
    # Read the CSV file into pandas datastructure
    meta_df = pd.read_csv(csv_path)
    # Print column names
    print(meta_df.columns)
    # Display first few rows of dataframe
    meta_df.head()
else:
    print(f"File {csv_path} does not exist.")

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis'],
      dtype='object')
   imdbid                   title  \
0  120770  A Night at the Roxbury   
1  132512          At First Sight   
2  118661            The Avengers   
3  215545              Bamboozled   
4  118715        The Big Lebowski   

                                                akas  year  metascore  \
0  Une nuit au Roxbury (France), Movida en el Rox...  1998         26   
1  Sight Unseen (United States), Premier regard (...  1999         40   
2  Chapeau melon et bottes de cuir (France), Mit ...  1998         12   
3  The Very Black Show (France), It's Showtime (G...  2000         54   
4  El gr

Columns relevant to us are:
- title
- age restrict
- year may be of relevance in examining changes in cultural norms over time. E.g. a certain curse word might get a movie an MA rating in the 1960s but not in the 2020s. 
- budget and opening weekend may be of relevance in examining impact of classification on the movie's net.
- imdbid may be of relevance for joining other data through the imdb database. 

In [7]:
print(screenplays[0][:100])

NameError: name 'screenplays' is not defined

## 0.2 Merging Data

Use filename patterns to distinguish titles from imdbids

In [10]:
# Extract movie titles and IDs
movie_titles, ids = zip(*[
    (f.split("_", 1)[0], f.split("_", 1)[1].split(".", 1)[0])
    for f in screenplays.keys()])

# Print first 10 titles and IDs
for i, (title, id) in enumerate(zip(movie_titles, ids)):
    if i == 10: 
        break
    print(f"Title: {title}   ID: {id}")

Title: 10 Cloverfield Lane   ID: 1179933
Title: 10 Things I Hate About You   ID: 0147800
Title: 101 Days of 101 Dalmatians   ID: 0249328
Title: 12 Angry Men   ID: 0118528
Title: 12 Monkeys   ID: 0114746
Title: 12 Years a Slave   ID: 2024544
Title: 127 Hours   ID: 1542344
Title: 13 13 13   ID: 2991516
Title: 1408   ID: 0450385
Title: 1492 Conquest of Paradise   ID: 0103594


Read screenplay text data into a Pandas DataFrame.

In [11]:
# Create a dataframe from screenplays
screenplays_df = pd.DataFrame({
    'imdbid': [
        os.path.splitext(f.split("_", 1)[1])[0].replace(".txt", "") for f in screenplays.keys()
        ],
    'screenplay': screenplays.values()
})
screenplays_df.head()

Unnamed: 0,imdbid,screenplay
0,1179933,The Cellar\n\nby\nJosh Campbell & Matt Stuecke...
1,147800,\n TEN THINGS I ...
2,249328,"107\n#2\n40] _DALMATIANS MARCH 17, 1995\n\nEX..."
3,118528,PLEASE COPY AND RETURN |\n\nâââ_âââ...
4,114746,\n\t\t\t\tTWELVE MONKEYS\n\t \n\t\t ...


Display data info for two dataframes.

In [9]:
print(meta_df.info())
print(screenplays_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   opening weekend            1739 non-null   object
 9   producers                  2640 non-null   object
 10  budget                     1624 non-null   object
 11  script department          2220 non-null   object
 12  production companies       2682 non-null   object
 13  writers                    2696 non-null   object
 14  director

Merge metadata with screenplay text data.

In [12]:
# merge with metadata on imdbid
if screenplays_df['imdbid'].dtype != 'int':
    screenplays_df['imdbid'] = screenplays_df['imdbid'].astype(int)
    
df = meta_df.merge(screenplays_df, on='imdbid')
df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,budget,script department,...,directors,casting directors,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...","$17,000,000 (estimated)",,...,John Fortenberry,Jeff Greenberg,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler","$60,000,000 (estimated)",,...,Irwin Winkler,"Kerry Barden, Billy Hopkins, Suzanne Smith","Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub","$60,000,000 (estimated)","Sharon Mansfield, Anna Worley",...,Jeremiah S. Chechik,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron","$10,000,000 (estimated)","Shari L. Carpenter, Carolyn De Sousa",...,Spike Lee,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...","$15,000,000 (estimated)",T. Kukovinski,...,"Joel Coen, Ethan Coen",John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


# 1. Exploration and Simple Preprocessing

In [13]:
df.columns

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis',
       'screenplay'],
      dtype='object')

## 1.1 Countries

In [14]:
df['countries'].value_counts()

countries
United States                                                          1613
United Kingdom                                                           96
United Kingdom, United States                                            92
United States, United Kingdom                                            87
United States, Canada                                                    58
                                                                       ... 
United States, Germany, Italy, Spain, France, Japan, United Kingdom       1
Finland, Estonia, Germany, United Kingdom                                 1
United States, Canada, New Zealand                                        1
Canada, Germany                                                           1
United Kingdom, China, United States                                      1
Name: count, Length: 360, dtype: int64

Checking here whether we're dealing exclusively with screenplays written in English or not.

In [15]:
# find samples of screenplays in which countries !contain "United States", "United Kingdom", "Canada" 
non_anglo = df[df['countries'].str.contains("United States|United Kingdom|Canada|Australia") == False]
non_anglo

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,budget,script department,...,directors,casting directors,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,screenplay
42,5639354,A Fantastic Woman,"Eine fantastische Frau (Germany), A Fantastic ...",2017,86,7,22299,"Oscar 2018, Golden Globe 2018, Golden Boll in ...",,"Maren Ade, Alexander Bohr, Eduardo Castro, Mar...",,"Eliseo Altunaga, Guillermo Calderón, Antonia O...",...,Sebastián Lelio,"Alejandra Alaff, Moira Miller","Daniela Vega, Francisco Reyes, Luis Gnecco, Al...","Chile, Spain, Germany","Australia:M, Brazil:14, Canada:14A::(Alberta),...","Marina, a transgender woman who works as a wai...",Somewhere in Santiago at a dimly-lit nightclub...,"transgender-woman, female-nudity, death-of-lov...",Drama,"Fearless, Powerful, Ravishing.",,A Fantastic Woman\n\nScreenplay by\nSebastian ...
49,6896536,Foxtrot,"運命は踊る (Japan, Japanese title), 今天跳舞不打仗 (Taiwan...",2017,88,7,6533,"Asia Pacific Screen Award 2017, City of Athens...",,"Marc Baschet, Martina Valentina Baumgartner, D...",,,...,Samuel Maoz,"Tanja Schuh, Chamutal Zerem","Lior Ashkenazi, Sarah Adler, Yonaton Shiray, S...","Israel, Switzerland, Germany, France","Australia:MA15+, Canada:14A::(Alberta), Canada...",A troubled family must face the facts when som...,Michael and Dafna are devastated when Army off...,"foreign-occupation, jarhead, bare-breasts, fem...",Drama,,,FOXTROT\n\nA screenplay by Samuel Maoz\n1. IN...
50,5304464,Happy End,"Un final feliz (Mexico), ハッピーエンド (Japan, Japan...",2017,72,6,12970,"Palme d'Or 2017, European Film Award 2017, Mag...",,"Stefan Arndt, Christopher Granier-Deferre, Mic...",,Maggie Perlado,...,Michael Haneke,"David El Hakim, Kris Portier de Bellair","Isabelle Huppert, Jean-Louis Trintignant, Math...","France, Austria, Germany","Australia:M, Austria:16, Brazil:14, Canada:14A...",A well-to-do French family deals with a series...,"Gradually succumbing to dementia, George Laure...","self-hatred, 12-year-old, hairdresser, depress...",Drama,Don't believe the title of this Michael Haneke...,,âHAPPY ENDâ\n\nMichael Haneke\nHAPPY END ...
52,6304162,Loveless,"Нелюбовь (Russia), Faute d'amour (France), Lov...",2017,86,7,29211,"Oscar 2018, Golden Globe 2018, BAFTA Film Awar...",,"Pascal Caucheteux, Gleb Fetisov, Sergey Melkum...",,Konstantin Tishchenko,...,Andrey Zvyagintsev,,"Maryana Spivak, Aleksey Rozin, Matvey Novikov,...","Russia, France, Germany, Belgium","Argentina:16, Australia:MA15+, Brazil:14, Cana...",A couple going through a divorce must team up ...,"Still living under the same roof, the Moscow c...","missing-child, broken-family, divorce, written...",Drama,A Missing Child. A Marriage Destroyed. A Count...,Whether or not it is designed as an allegory o...,"LOVELESS\n\nOleg Negin, Andrey Zvyagintsev\nL..."
57,3741632,The Leisure Seeker,"Ella & John - The Leisure Seeker (Italy), L'éc...",2017,45,6,7285,"Golden Globe 2018, Capri Ensemble Cast Award 2...",Italy:,"Cobi Benatoff, Elisabetta Boni, Ferdinando Bon...",,"Ben Barker, Katherine Steets",...,Paolo Virzì,"Tara Feldstein, Chase Paris","Helen Mirren, Donald Sutherland, Christian McK...","Italy, France","Australia:M, Austria:10, Brazil:14, Canada:PG:...",A runaway couple go on an unforgettable journe...,A runaway couple go on an unforgettable journe...,"recreational-vehicle, road-trip, husband-wife-...","Adventure, Comedy, Drama, Romance",A Once in a Lifetime Roadtrip They Will Never ...,John and Ella Robina have shared a wonderful l...,THE LEISURE SEEKER\n\nWritten by\n\nStephen Am...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540,6543896,Al Wazeer Jay,Al Wazeer Jay (Egypt),1986,-1,6,10,,,,,,...,Ibrahim Al Shaqanqiry,,"Osama Abbas, Umran Bahar, Ahmad Bedair, Mohame...",Egypt,,,,,Comedy,,,FOX STAR STUDIOS\n\nPresents\n\nNEERJA\n\nA Bl...
2541,6484982,Newton,"Νιούτον (Greece), Ньютон (Russia)",2017,-1,7,16310,"Asia Pacific Screen Award 2017, Asian Film Awa...",,"Shiladitya Bora, Raghav Gupta, Kanupriya, Suni...",,Varun Kumar,...,Amit Masurkar,"Romil Modi, Tejas Girish Thakker","Rajkummar Rao, Pankaj Tripathi, Anjali Patil, ...",India,"Argentina:13, Australia:MA15+, Germany:Not Rat...",A government clerk on election duty in the con...,"As India, the world's largest democracy, brace...","election, officer, honesty, duty, forest, trib...","Comedy, Drama",Electoral Rumble in the jungle.,"Nutan (Newton) Kumar, a rookie government cler...","EXT. SMALL TOWN STREET- DAY 1\nA red, dusty ro..."
2552,2150716,The Lobster's Cry,"The Lobster's Cry (World-wide, English title),...",2012,-1,7,54,"Grand Prix 2012, CinEuphoria 2014, César 2013,...",,"Nicolas Guiot, Sophie Leclercq, Fabrice Préel-...",,Amandine Lemal,...,Nicolas Guiot,Valérie Trajanovski,"Jana Bittnerová, Tatiana Gontcharova, Anton Ko...","France, Belgium",,A Russian soldier returns from war service in ...,,"chechnya, soldier, return-from-war, russian-fa...","Short, Drama, Family, War",,,THE LOBSTER\n\nWritten by\n\nYorgos Lanthimos ...
2745,93593,On the Silver Globe,"On the Silver Globe (United States), On the Si...",1988,72,7,2396,International Fantasy Film Award 1989,,,,,...,Andrzej Zulawski,,"Andrzej Seweryn, Jerzy Trela, Grazyna Dylag, W...",Poland,United States:Unrated,A team of astronauts land on an inhabitable pl...,"A small group of cosmic explorers, including a...","colonization, mythology, voice-over-narration,...","Adventure, Drama, Fantasy, Sci-Fi",,,SLIVER\n\nby Ira Levin\n\nScreenplay by Joe Es...


In [16]:
non_anglo['screenplay'].sample(20)

2180    NEVER LOOK AWAY\n\nAn original screenplay\nby\...
627     \n\n\n\n\n\n                                JE...
1462    \nLa Belle et la BÃªte (Beauty and the Beast) ...
1385    aC EDS\n\n \n
ROMA\n\n \n\nWritten and Directe...
515     \n                                  "THE GRAPE...
671     \n\n\n\n\n\n                             LES T...
2060    sbs\n\nPRODUCTIONS\n\nFRANKIE\n\nby\n\nMaurici...
838     Movie: MewTwo Strikes Back ***This is not an o...
920     The Seventh Seal\n \nThe night had brought lit...
2128    LAND OF MIN\n\n \n\nGI\n\nWritten by\n\nMartin...
1217    RAN\n\nScreenplay by\nAkira Kurosawa &\nHideo ...
1537    DUPLEX\nby\n\nLARRY DOYLE\n\nRevisions by\nGre...
1386    Southern Belle\n\nBy\n\nRhonnie Fordham\n\n229...
815     \n\n\n\n\n\n\n\n\n\n                     THE P...
811     \n\n\n\n\n\n\n\n\n\n                          ...
1685    MOEBIUS\n\nScreenplay by\nNeil Cohen\n\nBased ...
2193    PAIN AND GLORY\n\nOriginal script by\nPedro Al...
665     \n\n\n

E.g. La Belle et La Bete

In [17]:
non_anglo.loc[1462,'screenplay']

'\nLa Belle et la BÃªte (Beauty and the Beast) \n\nLa Belle et la BÃªte (Beauty and the Beast)\nOPENING TITLE:\n\nChildren believe what we tell them. \nThey have complete faith in us. \nThey believe that a rose plucked \nfrom a garden can bring drama to \na family. They believe that the \nhands of a human beast will smoke \nwhen he kills a victim, and that \nthis beast will be shamed when \nconfronted by a young girl. They \nbelieve in a thousand other simple \nthings. I ask of you a little of \nthis childlike simplicity, and to \nbring us luck let me speak four \ntruly magic words, childhood\'s \nOpen Sesame: \n\n\n\n\t\t\t"Once upon a time..." \n\n\n\nBeauty lives in the country with her father, a 17th-century merchant who has \nlost all his money; her brother, Ludovic, whose only interests are drinking \nand gambling; and her two sisters, Felicie and Adelaide, who are motivated \nentirely by spite, selfishness and vanity. Her brother\'s constant companion, \nAvenant, is a frequent v

Although the tile is in French, the screenplay itself is written in English.

## 1.2 Column Selection

In [18]:
# create a lean version of the dataframe containing only columns clearly relevant to predicting age restrict classification
relevant_cols = ['imdbid', 'title', 'year', 'opening weekend', 'budget', 'age restrict', 'genres', 'screenplay']
df_lean = df[relevant_cols]
df_lean.head()

Unnamed: 0,imdbid,title,year,opening weekend,budget,age restrict,genres,screenplay
0,120770,A Night at the Roxbury,1998,United States:,"$17,000,000 (estimated)","Argentina:13, Australia:M, Brazil:14, Canada:P...","Comedy, Music, Romance",\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,1999,United States:,"$60,000,000 (estimated)","Argentina:13, Australia:M, Canada:PG::(Alberta...","Drama, Romance",AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,1998,"United States: $10,305,957, 16 Aug 1998","$60,000,000 (estimated)","Argentina:13, Australia:PG, Brazil:10, Canada:...","Action, Adventure, Sci-Fi, Thriller",\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,2000,United States:,"$10,000,000 (estimated)","Australia:MA, Finland:K-15, France:Tous public...","Comedy, Drama, Music",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,1998,"United States: $5,533,844, 08 Mar 1998","$15,000,000 (estimated)","Argentina:16, Argentina:18::(cable rating), Au...","Comedy, Crime, Sport",\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


In [19]:
df_lean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2853 entries, 0 to 2852
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   imdbid           2853 non-null   int64 
 1   title            2853 non-null   object
 2   year             2853 non-null   int64 
 3   opening weekend  1736 non-null   object
 4   budget           1623 non-null   object
 5   age restrict     2524 non-null   object
 6   genres           2841 non-null   object
 7   screenplay       2853 non-null   object
dtypes: int64(2), object(6)
memory usage: 178.4+ KB


## 1.3 'age restrict' (target variable)

In [20]:
df_lean['age restrict'][:50]

0     Argentina:13, Australia:M, Brazil:14, Canada:P...
1     Argentina:13, Australia:M, Canada:PG::(Alberta...
2     Argentina:13, Australia:PG, Brazil:10, Canada:...
3     Australia:MA, Finland:K-15, France:Tous public...
4     Argentina:16, Argentina:18::(cable rating), Au...
5     Argentina:13, Australia:MA, Canada:14A::(Manit...
6     Argentina:16, Australia:MA, Belgium:KT/EA, Bra...
7     Argentina:18, Australia:M, Brazil:14, Canada:P...
8     Argentina:13, Australia:M, Canada:14A::(Albert...
9     Argentina:Atp, Australia:M, Finland:K-8, Germa...
10    Argentina:16, Australia:M, Australia:MA15+, Br...
11    Argentina:13, Australia:M, Brazil:10, France:T...
12    Argentina:Atp, Australia:G, Australia:PG::(TV ...
13    Argentina:16, Argentina:18, Australia:R, Brazi...
14    Argentina:Atp, Australia:G, Brazil:Livre, Cana...
15    Argentina:Atp, Australia:G, Brazil:Livre::(vid...
16                                                  NaN
17    Argentina:13, Australia:M, Brazil:12, Finl

In [25]:
# assess missing values in age restrict 
print("Nulls in 'age restrict' column:", df_lean['age restrict'].isnull().sum())

Nulls in 'age restrict' column: 329


In [26]:
# drop rows with missing values for age restrict 
df_clean = df_lean.dropna(how='any', subset='age restrict')
print("Shape of df_clean:", df_clean.shape)
print("Nulls remaining in 'age restrict':", df_clean['age restrict'].isnull().sum())

Shape of df_clean: (2524, 8)
Nulls remaining in 'age restrict': 0


### 1.3.1 Extracting Australian Classification where available

In [27]:
# regex pattern for finding Australian classification
pattern = re.compile(r"Australia:([A-Z0-9]+)")

# filter out the Australian age restrict classification 
def find_aus_classification(string):
    if isinstance(string, str):
        match = re.search(pattern, string)
        return match.group(1) if match else pd.NA
    return pd.NA

aus_classifications = df_clean['age restrict'].apply(find_aus_classification)
aus_classifications

0          M
1          M
2         PG
3         MA
4         MA
        ... 
2848    MA15
2849      PG
2850    MA15
2851      PG
2852    MA15
Name: age restrict, Length: 2524, dtype: object

In [28]:
# check value counts for aus_classifications
print(aus_classifications.value_counts())

age restrict
M       1000
MA15     411
PG       355
MA       194
R        128
G        111
R18       38
A         36
SOA       18
NRC       10
RC         1
Name: count, dtype: int64


From Wiki: 

<h3>Early film classification</h3>

"The Commonwealth Film Censorship Board was created in 1917 to view, classify, and censor films imported from overseas. In the early years of the system there were 3 ratings:[4]

G for "general audiences"
A for "not suitable for children"
SOA for "suitable for adults only"
All ratings were advisory in nature and while distributors were required to display them on advertising, there were no restrictions on children's attendance. As such, films with adult ratings were still routinely censored."
 https://en.wikipedia.org/wiki/Australian_Classification_Board


<i>
<ul> A almost certainly corresponds to G. </ul>
<ul> NRC is effectively PG. </ul>
<ul> SOA is more ambiguous, we'll drop this. </ul>
<ul> Extreme imbalance for 'RC': only one sample means it can't be represented in both training and testing data, so can't be used.  </ul>
</i> 


Assign aus_classifications back to a new DataFrame as a new column.

In [31]:
# create a dataset with aus classifications
df_clean.loc[:, 'age restrict aus'] = aus_classifications
df_clean['age restrict aus']

0                      M
1                      M
2                     PG
3                     MA
4                     MA
                    ... 
2849                  PG
2850                MA15
2851                  PG
2852                MA15
age restrict aus     NaN
Name: age restrict aus, Length: 2525, dtype: object

Creating a separate dataframe df_aus which will only contain rows with an  interpretable Australian classification. 

In [32]:
df_aus = df_clean.dropna(how='any', subset='age restrict aus')
df_aus.shape

(2302, 9)

In [37]:
# Now drop rows where 'age restrict aus' is SOA or RC
df_aus = df_aus[~df_aus['age restrict aus'].isin(['SOA', 'RC'])]
df_aus['age restrict aus'].value_counts()

age restrict aus
M       999
MA15    411
PG      355
MA      194
R       128
G       111
R18      38
A        36
NRC      10
Name: count, dtype: int64

In [22]:
# ## save as CSVs
# df_clean.to_csv('df_clean.csv')
# df_aus.to_csv('df_aus.csv')

# Text Preprocessing

## Cleaning

Note: 
- this was our first, simple attempt at text cleaning.  
- A superior attempt can be found in deployed_approach.ipynb. 
- The most comprehensive attempt can be found in final_preprocessing.ipynb, but this approach was not able to be deployed due to time constraints.


In [38]:
def clean_text(text):
    text = text.lower()  # convert to lower case
    text = re.sub(r'[^\w]', ' ', text)  # remove anything that is not a word char
    text = re.sub(r'[ ]{2,}', ' ', text) # remove extra spaces 
    text = re.sub(r'[ \t]+$', '', text) # remove trailing whitespace
    return text

In [39]:
df_clean.loc[:,'cleaned_screenplays'] = df_clean['screenplay'].apply(clean_text)
df_clean['cleaned_screenplays']

AttributeError: 'float' object has no attribute 'lower'

## Stopword Removal

In [58]:
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bened\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
stop_words = stopwords.words("english")

def remove_stopwords(text):
    processed_tokens = []
    for t in text.split():
        if t not in stop_words:
            processed_tokens.append(t)
    return " ".join(processed_tokens)

In [68]:
df_clean.loc[:,'cleaned_screenplays'] = df_clean['cleaned_screenplays'].apply(remove_stopwords)
df_clean['cleaned_screenplays'].head()

0    night roxbury written steve koren ferrell chri...
1    first sight ext valley dusk gold light dapplin...
2    avengers screenplay macpherson june 21 1995 1b...
3    bamboozled spike lee black screen hear voice m...
4    big lebowski floating steep scrubby slope hear...
Name: cleaned_screenplays, dtype: object

## Stemming

In [70]:
from nltk.stem import SnowballStemmer

In [89]:
def stem(text):
    stemmed_tokens = []
    stemmer = SnowballStemmer("english")
    for t in text.split():
        t = stemmer.stem(t)
        stemmed_tokens.append(t)
    return " ".join(stemmed_tokens)

In [76]:
df_clean.columns

Index(['imdbid', 'title', 'year', 'opening weekend', 'budget', 'age restrict',
       'genres', 'screenplay', 'age restrict aus', 'cleaned_screenplays'],
      dtype='object')

In [90]:
df_clean.loc[:,'cleaned_screenplays'] = df_clean['cleaned_screenplays'].apply(stem)
df_clean['cleaned_screenplays']

0       night roxburi written steve koren ferrel chris...
1       first sight ext valley dusk gold light dappl a...
2       aveng screenplay macpherson june 21 1995 1blue...
3       bamboozl spike lee black screen hear voic malc...
4       big lebowski float steep scrubbi slope hear ma...
                              ...                        
2848    war world screenplay josh friedman david koepp...
2849    june 29 1994 sleep dan sullivan fred lebow ext...
2850    wind river written taylor sheridan open snow c...
2851    wrong accus pat proft proft thing intern revis...
2852    never realli screenplay lynn ramsay base novel...
Name: cleaned_screenplays, Length: 2524, dtype: object

In [106]:
df_clean['age restrict aus'].value_counts()

age restrict aus
M        877
PG       335
MA15+    314
MA       173
R        103
G         89
Name: count, dtype: int64

If we narrow scope to Aus classifications for now, we have 6 target classes

# Feature and Target Engineering

Encode Target (Aus ratings)

In [115]:
aus_target_map = {
    'G': 0,
    'PG': 1,
    'M': 2,
    'MA': 3,
    'MA15+': 4,
    'R': 5
} 

y = df_aus['age restrict aus'].map(aus_target_map) 
y.value_counts()

age restrict aus
2    877
1    335
4    314
3    173
5    103
0     89
Name: count, dtype: int64

In [156]:
def compute_class_balance(y): 
    classes = sorted(list(y.unique()))
    value_counts = y.value_counts()
    for c in classes: 
        class_ratio = value_counts[c] / len(y) * 100
        print(f"{c} represents {class_ratio}% of target data")

In [157]:
compute_class_balance(y) 

0 represents 4.706504494976203% of target data
1 represents 17.715494447382337% of target data
2 represents 46.37757800105764% of target data
3 represents 9.148598625066102% of target data
4 represents 16.604970914859862% of target data
5 represents 5.4468535166578524% of target data


There is a significant class imbalance with regard to movies rated M (3); the other imbalances are less significant.

In [154]:
X = df_aus['screenplay'].apply(clean_text)
X = X.apply(remove_stopwords)
X = X.apply(stem)
X

0       night roxburi written steve koren ferrel chris...
1       first sight ext valley dusk gold light dappl a...
2       aveng screenplay macpherson june 21 1995 1blue...
4       big lebowski float steep scrubbi slope hear ma...
5       boy side scene 1 jane thank jane deluca johnni...
                              ...                        
2847    vo ud twenti pirst centuri por trale wreeten b...
2849    june 29 1994 sleep dan sullivan fred lebow ext...
2850    wind river written taylor sheridan open snow c...
2851    wrong accus pat proft proft thing intern revis...
2852    never realli screenplay lynn ramsay base novel...
Name: screenplay, Length: 1891, dtype: object

# Data Split 

## Splitting by 'year'

In [121]:
df_aus['year'].describe()

count    1891.000000
mean     1997.916446
std        16.889852
min      1922.000000
25%      1991.000000
50%      2001.000000
75%      2010.000000
max      2021.000000
Name: year, dtype: float64

The most recent 25% of the data comes from 2010 onwards.  We will take this as test data in the hopes our model will still work in classifying the most recent films. 

In [131]:
test_index = df_aus.loc[df_aus['year'] > 2010].index
test_index

Index([  39,   43,   44,   46,   48,   51,   52,   55,  101,  110,
       ...
       2829, 2830, 2832, 2837, 2839, 2844, 2845, 2847, 2850, 2852],
      dtype='int64', length=420)

In [135]:
test_ratio = len(test_index) / len(df_aus.index)
print(f"Films released after 2010 (data reserved for testing) represent {test_ratio} of the dataset")

Films released after 2010 (data reserved for testing) represent 0.222104706504495 of the dataset


In [141]:
from sklearn.model_selection import train_test_split 

X_test = X.loc[test_index]
y_test = y.loc[test_index]

X_data = X.loc[~X.index.isin(test_index)]
y_data = y.loc[~y.index.isin(test_index)]

# randomized split for train-val 
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=1)

compute_class_balance(y_train)
compute_class_balance(y_val)
compute_class_balance(y_test)

NameError: name 'compute_class_balance' is not defined