### **Web Scraping**

In [1]:
!pip install requests
# "requests" is a popular Python library that simplifies sending HTTP/1.1 requests.
# It is used here to download web pages; it is also commonly used to interact with web APIs.



In [2]:
!pip install beautifulsoup4
#  Beautiful Soup is used for web scraping: it pulls data out of HTML and XML files.
#  It provides Pythonic idioms for iterating over, searching, and modifying the parse tree.




In [3]:
import requests
from bs4 import BeautifulSoup
# Import the libraries used for web scraping: requests fetches the page and BeautifulSoup parses its HTML.


In [4]:
url = "https://imsdb.com/scripts/A-Quiet-Place.html"
# Address of the web page to scrape (the "A Quiet Place" script page on imsdb.com), stored in the variable "url".

In [5]:
# Download the page at "url"; the response object is stored in "page".
# The timeout keeps the cell from hanging indefinitely on an unresponsive server,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an exception
# instead of letting an error page be parsed as if it were the script.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [6]:
soup = BeautifulSoup(page.text, 'html.parser')
# page.text holds the HTML content of the downloaded page. The 'html.parser' argument specifies the parser to be used by Beautiful Soup.
# The resulting "soup" object lets you navigate, search, and extract information from the HTML document.

In [7]:
print(soup)
# Print the parsed document, i.e. the HTML content of the web page.

<html>
<head>
<!-- Google tag (gtag.js) -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-W5BXG8HCH3"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-W5BXG8HCH3');
</script>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="EN" http-equiv="Content-Language"/>
<meta content="Document" name="objecttype"/>
<meta content="INDEX, FOLLOW" name="ROBOTS"/>
<meta content="Movie scripts, Film scripts" name="Subject"/>
<meta content="General" name="rating"/>
<meta content="Global" name="distribution"/>
<meta content="2 days" name="revisit-after"/>
<link href="/style.css" rel="stylesheet" type="text/css"/>
</head>
<body bottommargin="0" id="mainbody" topmargin="0">
<table border="0" cellpadding="0" cellspacing="0" 

In [8]:
print(soup.prettify())
# prettify() returns a neatly formatted, indented string representation of the HTML or XML parse tree.
# This makes the document's structure easier to read, so the hierarchy of elements can be inspected and understood.

<html>
 <head>
  <!-- Google tag (gtag.js) -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=G-W5BXG8HCH3">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-W5BXG8HCH3');
  </script>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="true" name="HandheldFriendly"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="EN" http-equiv="Content-Language"/>
  <meta content="Document" name="objecttype"/>
  <meta content="INDEX, FOLLOW" name="ROBOTS"/>
  <meta content="Movie scripts, Film scripts" name="Subject"/>
  <meta content="General" name="rating"/>
  <meta content="Global" name="distribution"/>
  <meta content="2 days" name="revisit-after"/>
  <link href="/style.css" rel="stylesheet" type="text/css"/>
 </head>
 <body bottommargin="0" id="mainbody" topmargin="0">
  <table border

In [9]:
# Locate the table cell that holds the script: a "td" tag with the class "scrtext".
script_cell = soup.find('td', class_='scrtext')
if script_cell is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on ".text".
    raise ValueError("No <td class='scrtext'> element was found in the downloaded page.")

# Keep only the text of that cell, without the HTML tags and without
# leading/trailing whitespace, and print it so the plain script can be inspected.
scrap_text = script_cell.text.strip()
print(scrap_text)

A QUIET PLACE



                          Screenplay by

              Bryan Woods, Scott Beck & John Krasinski


                            Story by

                     Bryan Woods & Scott Beck





    BLACK

    We hear, very clearly, the sound of light wind.


1   EXT. TOWN - MAIN ST. - LATE AFTERNOON                        1

    We come up on... a streetlight. There is no illumination...
    and no movement. We hold on it for a long moment when
    suddenly... The streetlight bobs... and then begins to sway.
    We slowly begin to rise up on the streetlight to reveal... a
    small bird has landed on it. We continue to rise to reveal,
    behind the bird...

    Wind blows through the   gargantuan evergreens that seem to
    engulf the narrow main   street of a small town in upstate New
    York. Shop windows and   cars on either side covered in dust,
    the place seems frozen   in time. There is no movement.

    In the very middle of the

In [12]:
# Save the extracted script text to a file.
file_path = "/content/Quite_place.txt"

# Write the file as UTF-8 explicitly: it is read back later with
# encoding='utf-8', so the platform default encoding must not be used here.
with open(file_path, 'w', encoding='utf-8') as file:
    # The with-block closes the file automatically when it ends.
    file.write(scrap_text)
print(f"Formatted text saved to {file_path}")

# The extracted script is now stored in the text file located at file_path.


Formatted text saved to /content/Quite_place.txt


### **Script Parser & Data Format Conversions**

In [None]:
import pandas as pd
import os
import re

# Import the libraries required for this task.

In [13]:
import pandas as pd
import re

def scriptParser(input_file_path, output_csv_path):
    """Split a plain-text screenplay into scenes and save them as a CSV file.

    The file is read line by line. A line that looks like a scene heading
    starts a new scene; every other line is appended to the text of the
    current scene. Text found before the first heading is stored in a first
    row whose heading is the empty string. Lines that hold only a page number
    (digits followed by a period) are dropped.

    Args:
        input_file_path: Path of the UTF-8 text file that contains the script.
        output_csv_path: Path of the CSV file to write. It has two columns:
            'Scenes' (the heading) and 'Scripts' (the text of that scene).

    Returns:
        None. The result is written to ``output_csv_path``.
    """
    # Scene-heading patterns.
    # patternA: lines that start with a digit and end with a digit plus
    # optional letters/digits, or a number followed by at most one space and
    # exactly one more character.
    patternA = re.compile(r'^\d.*\d[a-zA-Z0-9]*\n?$|^\d+\s?.$', flags=re.MULTILINE)
    # patternB: lines that start with the word EXT or INT.
    patternB = re.compile(r'^\bEXT\b.+$|^\bINT\b.+$', flags=re.MULTILINE)
    # patternC: lines that start with I/E.
    patternC = re.compile(r'^\bI/E\b.+$', flags=re.MULTILINE)
    # A line made only of digits followed by a period is a page number.
    page_number_pattern = re.compile(r'^\d+\.$')

    with open(input_file_path, 'r', encoding='utf-8') as file:
        preText = file.readlines()

    sceneText = ['']  # Scene headings; '' stands for the text before the first heading.
    scriptText = []   # Script text, exactly one entry per heading in sceneText.
    pending = []      # Stripped lines collected for the scene currently being read.

    for raw_line in preText:
        line = raw_line.strip()

        if page_number_pattern.match(line):
            continue

        if patternA.match(line) or patternB.match(line) or patternC.match(line):
            # Always close the previous scene, even when it has no text, so
            # that both lists keep the same length (two headings in a row
            # would otherwise make the DataFrame construction fail).
            scriptText.append(' '.join(pending).strip())
            pending = []
            sceneText.append(line)
        else:
            # Blank lines are kept as empty entries, so they still contribute
            # a separating space when the scene text is joined.
            pending.append(line)

    # Close the last scene after the loop, so this also happens when the file
    # ends with a page-number line or is empty.
    scriptText.append(' '.join(pending).strip())

    # Build the scenes/scripts table and save it to the requested CSV path.
    df_movie = pd.DataFrame({'Scenes': sceneText, 'Scripts': scriptText})
    df_movie.to_csv(output_csv_path, index=False)

# Run the parser on the saved script text; this produces the CSV file that is read in the next cell.
scriptParser("/content/Quite_place.txt", "/content/Qiut.csv")


In [14]:
df = pd.read_csv('/content/Qiut.csv')
df.head(5)
# Read the CSV file back into a DataFrame and display its first five rows.

Unnamed: 0,Scenes,Scripts
0,,"A QUIET PLACE Screenplay by Bryan Woods, S..."
1,1 EXT. TOWN - MAIN ST. - LATE AFTERNOON ...,We come up on... a streetlight. There is no il...
2,2 INT. GENERAL STORE - FRONT - LATE AFTERNOO...,Slowly pushing through the doorway of a large ...
3,3 INT. GENERAL STORE AISLES - LATE AFTERNOON...,"WE MOVE SLOWLY across the floor, down the midd..."
4,4 INT. AISLE - LATE AFTERNOON ...,"A 10 year old girl stands on her toes, barely ..."


### **Setting Background Format**

In [15]:
def breakScene(df, time_pos, output_csv_path="/content/Update_Background_set.csv"):
    """Split each scene heading into a location and a time.

    Scene numbers (digits, optionally followed by one word character) are
    removed from every entry of ``df['Scenes']`` and the remainder is split
    on '-'. The first part becomes the location; the time is picked from the
    remaining parts according to ``time_pos``. Each part is stripped of
    surrounding whitespace.

    Args:
        df: DataFrame with a 'Scenes' column (scene headings) and a 'Scripts'
            column (scene text).
        time_pos: 'last' takes the last part as the time, 'middle' takes the
            part at index ``round(last_index / 2)``. Any other value, or a
            heading without '-', gives an empty time.
        output_csv_path: Where the result is saved as CSV. Pass None to skip
            saving.

    Returns:
        A DataFrame with the columns 'Location', 'Time' and 'Scripts'.
    """
    reg = re.compile(r'\d+\w?')
    location = []
    time = []

    for sc in df['Scenes']:
        # A missing heading (NaN/None, e.g. an empty CSV field) is treated as
        # an empty heading; str() would turn it into the literal text 'nan'.
        heading = '' if pd.isna(sc) else str(sc)
        # Strip every part so the spaces around the '-' separators do not end
        # up in the location or time values.
        parts = [part.strip() for part in reg.sub('', heading).strip().split('-')]
        last = len(parts) - 1

        location.append(parts[0])
        if last != 0 and time_pos == 'last':
            time.append(parts[last])
        elif last != 0 and time_pos == 'middle':
            # round() rounds halves to the nearest even integer, so a heading
            # with two parts (last == 1) selects index 0.
            time.append(parts[round(last / 2)])
        else:
            time.append('')

    df_re = pd.DataFrame({'Location': location, 'Time': time, 'Scripts': df['Scripts']})
    if output_csv_path is not None:
        df_re.to_csv(output_csv_path, index=False)
    return df_re

# The code above defines a function breakScene that takes a DataFrame (df) and a time_pos parameter as inputs.
# The function processes the 'Scenes' column of the DataFrame and extracts information from each scene heading.
# It uses a regular expression (reg) to remove the scene numbers (digits, optionally followed by one word character) and then splits each heading on the hyphen.
# The location is taken from the first part of the split. The time is chosen according to time_pos: the last part for 'last', a middle part for 'middle', and an empty string for any other value or when the heading contains no hyphen.
# The extracted information is then organized into a new DataFrame (df_re) with columns for 'Location', 'Time', and the original 'Scripts'. Finally, the resulting DataFrame is saved to a CSV file, and the function returns the processed DataFrame.

In [16]:
# Position of the time inside a scene heading: either 'last' or 'middle'.
pos = 'last'
# Split the headings and keep the result under both names (new_df and df).
# Because df is rebound to the result, this cell expects df to still be the
# parsed-script table with its 'Scenes' column at the moment it runs.
df = new_df = breakScene(df, pos)
df

Unnamed: 0,Location,Time,Scripts
0,,,"A QUIET PLACE Screenplay by Bryan Woods, S..."
1,EXT. TOWN,LATE AFTERNOON,We come up on... a streetlight. There is no il...
2,INT. GENERAL STORE,LATE AFTERNOON,Slowly pushing through the doorway of a large ...
3,INT. GENERAL STORE AISLES,LATE AFTERNOON,"WE MOVE SLOWLY across the floor, down the midd..."
4,INT. AISLE,LATE AFTERNOON,"A 10 year old girl stands on her toes, barely ..."
...,...,...,...
119,INT. FARMHOUSE,DAWN,CLOSE ON MONITORS... As EVELYN's face comes in...
120,INT. FARMHOUSE,DAWN,CLOSE ON LEE's shotgun laying on the floor. Su...
121,INT. FARMHOUSE,DAWN,"THROUGH THE LIVING ROOM WINDOW, WE LOOK OUT on..."
122,INT. FARMHOUSE,DAWN,WE LOOK DIRECTLY up the basement stairs toward...


In [17]:
# Save the location/time table as CSV. breakScene() has already written its result
# to this same path, and df is that result, so this rewrites the file with the same data.
df.to_csv("/content/Update_Background_set.csv", index=False)