# Parsing and Extraction

PDF

In [None]:
!pip install PyMuPDF

In [2]:
import pandas as pd
import fitz
import os
import re
from typing import List, Dict

In [3]:
def read_pdf(folder_path):
  pdf_files = []
  for filename in os.listdir(folder_path):
      if filename.endswith('.pdf'):
          pdf_files.append(os.path.join(folder_path, filename))
  return pdf_files

In [4]:
pdf_files = read_pdf('/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays')

In [None]:
len(pdf_files)

1078

In [None]:
pdf_files[0:4]

['/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays/A History of Violence.pdf',
 '/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays/Doom.pdf',
 '/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays/The Descent.pdf',
 '/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays/Capote.pdf']

In [None]:
def is_text_pdf(pdf_path):
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text = page.get_text("text").strip()
                if text:  # jeśli choć jedna strona ma tekst → to tekstowy PDF
                    return True
        return False
    except Exception:
        return False

In [None]:
def extract_screenplay_elements(pdf_files):
    elements = []

    for pdf_file in pdf_files:
        title = pdf_file.split('/')[-1].rsplit('.', 1)[0]
        print(title)

        if not is_text_pdf(pdf_file):
            print(f"SCAN: {title}")
            elements.append({'title': title,'format': 'scan','label': None,'text': None,})
            continue

        doc = fitz.open(pdf_file)

        for page_num in range(1, len(doc)):
            page = doc[page_num]
            width = page.rect.width

            text_content = page.get_text()
            lines = text_content.split('\n')
            words_data = page.get_text("words")  # (x0, y0, x1, y1, word, block, line, word_no)

            line_positions = {}
            current_line_words = []
            prev_line_no = None

            for word_info in words_data:
                x0, y0, x1, y1, word, block_no, line_no, word_no = word_info

                if prev_line_no is not None and line_no != prev_line_no:
                    if current_line_words:
                        line_text = " ".join(w[4] for w in current_line_words)
                        first_x = current_line_words[0][0]
                        last_x = current_line_words[-1][2]
                        line_positions[line_text.strip()] = (first_x, last_x)
                    current_line_words = []

                current_line_words.append(word_info)
                prev_line_no = line_no

            if current_line_words:
                line_text = " ".join(w[4] for w in current_line_words)
                first_x = current_line_words[0][0]
                last_x = current_line_words[-1][2]
                line_positions[line_text.strip()] = (first_x, last_x)

            for line in lines:
                text = line.strip()
                if not text:
                    continue

                if text in line_positions:
                    x0_first, x1_last = line_positions[text]
                    indent_left = x0_first / width * 1000
                    indent_right = (width - x1_last) / width * 1000
                else:
                    indent_left = 0
                    indent_right = 0

                is_upper = text.isupper()

                if text.startswith("(") and text.endswith(")"):
                    label = "parenthetical"
                elif any(keyword in text.upper() for keyword in ["INT.", "EXT.", "FADE IN", "FADE OUT", "CUT TO", "CONTINUED", "CONTINUE"]):
                    label = "scene_heading"
                elif is_upper and len(text.split()) <= 4 and indent_left > 200 and indent_right > 200:
                    label = "character"
                elif indent_left > 200 and indent_right > 200:
                    label = "dialogue"
                else:
                    label = "action"

                elements.append({'title': title,'format': 'text','label': label,'text': text})

        doc.close()

    return elements

In [None]:
elements = extract_screenplay_elements(pdf_files)

In [None]:
df = pd.DataFrame(elements)

In [None]:
df

Unnamed: 0,title,format,label,text
0,A History of Violence,text,scene_heading,EXT. CALIFORNIA HIGHWAY -
1,A History of Violence,text,character,MORNING
2,A History of Violence,text,dialogue,A battered
3,A History of Violence,text,dialogue,old
4,A History of Violence,text,dialogue,motel
...,...,...,...,...
5549486,Zathura: A Space Adventure,text,action,"them, a whole wall of imagination waiting to b..."
5549487,Zathura: A Space Adventure,text,action,time the door is opened.
5549488,Zathura: A Space Adventure,text,action,"Dad looks fondly at the games, already can't w..."
5549489,Zathura: A Space Adventure,text,action,He pulls the chain on the light bulb and we


In [None]:
df.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated.csv', index=False, encoding="utf-8-sig")

TXT

In [None]:
def read_txt(folder_path):
    txt_files = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            txt_files.append(os.path.join(folder_path, filename))
    return txt_files

In [None]:
txt_files = read_txt('/content/drive/MyDrive/PROJECT/movie_screenplays/raw_screenplays')

In [None]:
def read_txt_with_indents(txt_file):
    lines_data = []

    try:
        with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
            all_lines = f.readlines()
    except Exception as e:
        print(f"!!! ERROR UTF-8 for {txt_file}: {e}")
        try:
            with open(txt_file, 'r', encoding='latin-1') as f:
                all_lines = f.readlines()
        except Exception as e2:
            print(f"!!! ERROR Latin-1 for {txt_file}: {e2}")
            return []

    if not all_lines:
        print(f"!!! Pusty plik: {txt_file}")
        return []

    actual_max_width = max(len(line.rstrip('\n\r')) for line in all_lines)
    if actual_max_width == 0:
        actual_max_width = 80

    for line in all_lines:
        line = line.rstrip('\n\r')
        line_clean = re.sub(r'<[^>]+>', '', line)

        indent_left = len(line_clean) - len(line_clean.lstrip())
        text = line_clean.strip()

        if not text:
            lines_data.append({'text': '','indent_left': 0,'is_empty': True})
            continue


        indent_left_normalized = (indent_left / actual_max_width) * 1000

        lines_data.append({'text': text,'indent_left': indent_left_normalized,'is_empty': False})

    return lines_data

In [None]:
elements = []
for txt_file in txt_files:

    title = txt_file.split('/')[-1].split('.')[0]
    lines_data = read_txt_with_indents(txt_file)

    for line in lines_data:

        if line['is_empty']:
            continue

        text = line['text']
        indent_left = line['indent_left']
        is_upper = text.isupper()

        if text.startswith("(") and text.endswith(")"):
            label = "parenthetical"
        elif any(keyword in text.upper() for keyword in ["INT.", "EXT.", "FADE IN", "FADE OUT", "CUT TO", "CONTINUED", "CONTINUE", "THE END"]):
            label = "scene_heading"
        elif is_upper and len(text.split()) <= 4 and indent_left > 300:
            label = "character"
        elif indent_left > 200:
            label = "dialogue"
        else:
            label = "action"

        elements.append({'title': title,'format': 'text','label': label,'text': text})


In [None]:
df = pd.DataFrame(elements)

In [None]:
df.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_txt.csv', index=False, encoding="utf-8-sig")

SCAN (skip)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated.csv')

In [None]:
df = df[df['format'] != 'scan']

In [None]:
df.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_pdf.csv', index=False, encoding="utf-8-sig")

MERGE

In [None]:
df_pdf = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_pdf.csv')

In [None]:
df_txt = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_txt.csv')

In [None]:
len(df_pdf)

5549402

In [None]:
len(df_txt)

1123689

In [None]:
df_pdf

Unnamed: 0,title,format,label,text
0,A History of Violence,text,scene_heading,EXT. CALIFORNIA HIGHWAY -
1,A History of Violence,text,character,MORNING
2,A History of Violence,text,dialogue,A battered
3,A History of Violence,text,dialogue,old
4,A History of Violence,text,dialogue,motel
...,...,...,...,...
5549397,Zathura: A Space Adventure,text,action,"them, a whole wall of imagination waiting to b..."
5549398,Zathura: A Space Adventure,text,action,time the door is opened.
5549399,Zathura: A Space Adventure,text,action,"Dad looks fondly at the games, already can't w..."
5549400,Zathura: A Space Adventure,text,action,He pulls the chain on the light bulb and we


In [None]:
def filter_df(df):
    rows_to_keep = []
    i = 0
    n = len(df)

    while i < n:
        if df.loc[i, "label"] == "character":
            if i + 1 < n and df.loc[i + 1, "label"] == "dialogue":
                rows_to_keep.append(i)
                j = i + 1
                while j < n and df.loc[j, "label"] == "dialogue":
                    rows_to_keep.append(j)
                    j += 1
                i = j
            else:
              i += 1
        else:
          i += 1
    df_filtered = df.loc[rows_to_keep].reset_index(drop=True)
    return df_filtered

In [None]:
df_pdf_filtered = filter_df(df_pdf)

In [None]:
df_pdf_filtered

Unnamed: 0,title,format,label,text
0,A History of Violence,text,character,MORNING
1,A History of Violence,text,dialogue,A battered
2,A History of Violence,text,dialogue,old
3,A History of Violence,text,dialogue,motel
4,A History of Violence,text,dialogue,by the
...,...,...,...,...
1994291,Zathura: A Space Adventure,text,character,DANNY
1994292,Zathura: A Space Adventure,text,dialogue,I know.
1994293,Zathura: A Space Adventure,text,character,DAY
1994294,Zathura: A Space Adventure,text,dialogue,/: {i


In [None]:
df_txt_filtered = filter_df(df_txt)

In [None]:
df_txt_filtered[100:110]

Unnamed: 0,title,format,label,text
100,John Wick,text,character,AURELIO
101,John Wick,text,dialogue,You pull a gun? On me? In my
102,John Wick,text,dialogue,house?
103,John Wick,text,character,AURELIO (CONT'D)
104,John Wick,text,dialogue,Flick off the safety.
105,John Wick,text,character,AURELIO (CONT'D)
106,John Wick,text,dialogue,Pull back the hammer.
107,John Wick,text,character,AURELIO (CONT'D)
108,John Wick,text,dialogue,"Now, either shoot me..."
109,John Wick,text,character,VIKTOR


In [None]:
def clean_dialogues(df):
    df["text"] = df["text"].astype(str)
    df["text"] = df["text"].str.replace(r"[^A-Za-z\s]", "", regex=True)
    df["text"] = df["text"].str.replace(r"\s+", " ", regex=True).str.strip()
    df = df[df["text"] != ""].reset_index(drop=True)
    return df

In [None]:
df_pdf_clean = clean_dialogues(df_pdf_filtered)

In [None]:
df_txt_clean = clean_dialogues(df_txt_filtered)

In [None]:
def merge_dialogues(df_filtered):
    merged_rows = []
    current_dialogue = None

    for _, row in df_filtered.iterrows():
        if row["label"] == "dialogue":
            if current_dialogue is None:
                current_dialogue = row.copy()
            else:
                current_dialogue["text"] += " " + row["text"]
        else: #charcter
            if current_dialogue is not None:
                merged_rows.append(current_dialogue)
                current_dialogue = None

            merged_rows.append(row)

    if current_dialogue is not None:
        merged_rows.append(current_dialogue)

    df_merged = pd.DataFrame(merged_rows).reset_index(drop=True)
    return df_merged

In [None]:
df_pdf_merged = merge_dialogues(df_pdf_clean)

In [None]:
df_txt_merged = merge_dialogues(df_txt_clean)

In [None]:
df_pdf_merged

Unnamed: 0,title,format,label,text
0,A History of Violence,text,character,MORNING
1,A History of Violence,text,dialogue,A battered old motel by the side of the road T...
2,A History of Violence,text,character,A
3,A History of Violence,text,dialogue,large pathetic green papier mache dinosaur som...
4,A History of Violence,text,character,BILLY
...,...,...,...,...
1190434,Zathura: A Space Adventure,text,dialogue,at didnt happen I fell in front of the TV
1190435,Zathura: A Space Adventure,text,character,DANNY
1190436,Zathura: A Space Adventure,text,dialogue,I know
1190437,Zathura: A Space Adventure,text,character,DAY


In [None]:
df_txt_merged

Unnamed: 0,title,format,label,text
0,John Wick,text,character,JOHN WICK
1,John Wick,text,dialogue,Written by Derek Kolstad
2,John Wick,text,character,JOHN
3,John Wick,text,dialogue,This is John
4,John Wick,text,character,JOHN
...,...,...,...,...
148393,You Can Count on Me,text,dialogue,Come on Sammy Look at me Look at me
148394,You Can Count on Me,text,character,TERRY
148395,You Can Count on Me,text,dialogue,Hey Sammy Remember when we were kids remember ...
148396,You Can Count on Me,text,character,SAMMY


In [None]:
df_pdf_merged.drop(columns = ["format"], inplace = True)

In [None]:
df_txt_merged.drop(columns = ["format"], inplace = True)

In [None]:
df_merged = pd.concat([df_pdf_merged, df_txt_merged], ignore_index=True)

In [None]:
df_merged

Unnamed: 0,title,label,text
0,A History of Violence,character,MORNING
1,A History of Violence,dialogue,A battered old motel by the side of the road T...
2,A History of Violence,character,A
3,A History of Violence,dialogue,large pathetic green papier mache dinosaur som...
4,A History of Violence,character,BILLY
...,...,...,...
1338832,You Can Count on Me,dialogue,Come on Sammy Look at me Look at me
1338833,You Can Count on Me,character,TERRY
1338834,You Can Count on Me,dialogue,Hey Sammy Remember when we were kids remember ...
1338835,You Can Count on Me,character,SAMMY


In [None]:
df_merged.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_merged.csv', index=False, encoding="utf-8-sig")

In [None]:
df_characters = df_merged[df_merged["label"] == "character"]

In [None]:
df_characters.reset_index(drop = True)

Unnamed: 0,title,label,text
0,A History of Violence,character,MORNING
1,A History of Violence,character,A
2,A History of Violence,character,BILLY
3,A History of Violence,character,LELAND
4,A History of Violence,character,BILLY
...,...,...,...
670733,You Can Count on Me,character,SAMMY
670734,You Can Count on Me,character,TERRY
670735,You Can Count on Me,character,TERRY
670736,You Can Count on Me,character,TERRY


In [None]:
df_characters.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_merged_characters.csv', index=False, encoding="utf-8-sig")