# B- Write an automated script to search for, find, and download the screenplay of each title, and store each screenplay as (semi-)structured data.

## 1- Import libraries

In [26]:
import os
import re
import shutil
import string
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

## 2- Load the top 250 movies dataset and create a dataframe of the movie names

In [27]:
df = pd.read_csv('abeer_badawi.csv')
df = df['Movie Name']
df = pd.DataFrame(df)

## 3- Helper functions: clean file names, fetch pages, and extract script text from a URL

In [3]:
def format_filename(s):
    """Turn a movie title (or URL fragment) into a tidy file-name stem.

    Only ASCII letters, digits, spaces, ``-``, ``(``, ``)`` and ``%`` are
    kept. The URL escapes ``%20`` and ``%27`` are then turned into a space
    and nothing respectively, runs of hyphens are collapsed to a single
    hyphen, and surrounding whitespace is trimmed.

    Args:
        s: The raw title string.

    Returns:
        The sanitised string (possibly empty).
    """
    allowed = set("-() %" + string.ascii_letters + string.digits)
    kept = [ch for ch in s if ch in allowed]
    cleaned = ''.join(kept).replace('%20', ' ').replace('%27', '')
    return re.sub(r'-+', '-', cleaned).strip()


def get_soup(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
    """
    page = urllib.request.Request(url)
    # The context manager closes the connection even if read() raises;
    # previously the response object was left open.
    with urllib.request.urlopen(page) as result:
        resulttext = result.read()
    return BeautifulSoup(resulttext, 'html.parser')

def get_pdf_text(url):
    """Download a PDF from *url* and return its text content.

    The PDF is stored temporarily as ``scripts/document.pdf``, handed to
    ``textract`` and deleted again, whether or not extraction succeeded.

    Args:
        url: Address of the PDF file.

    Returns:
        The extracted text, or ``""`` if text extraction failed.

    Raises:
        OSError: If the download or writing the temporary file fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    doc = os.path.join("scripts", "document.pdf")
    # The temporary file lives in "scripts"; create it so open() cannot fail
    # just because no scraper has made the directory yet.
    os.makedirs(os.path.dirname(doc), exist_ok=True)
    try:
        with urllib.request.urlopen(url) as result, open(doc, 'wb') as f:
            f.write(result.read())
        try:
            # NOTE(review): `textract` is not imported in the visible part of
            # this file. If it is missing, the resulting NameError is caught
            # below and every call returns "" - confirm the import exists.
            text = textract.process(doc, encoding='utf-8').decode('utf-8')
        except Exception:
            # Best effort: an unreadable PDF yields an empty script.
            text = ""
    finally:
        # Remove the temporary file even when the download itself failed.
        if os.path.isfile(doc):
            os.remove(doc)
    return text

def get_doc_text(url):
    """Download a Word document from *url* and return its text content.

    The document is stored temporarily as ``scripts/document.doc``, handed to
    ``textract`` and deleted again, whether or not extraction succeeded.

    Args:
        url: Address of the ``.doc`` file.

    Returns:
        The extracted text, or ``""`` if text extraction failed.

    Raises:
        OSError: If the download or writing the temporary file fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    doc = os.path.join("scripts", "document.doc")
    # The temporary file lives in "scripts"; create it so open() cannot fail
    # just because no scraper has made the directory yet.
    os.makedirs(os.path.dirname(doc), exist_ok=True)
    try:
        with urllib.request.urlopen(url) as result, open(doc, 'wb') as f:
            f.write(result.read())
        try:
            # NOTE(review): `textract` is not imported in the visible part of
            # this file. If it is missing, the resulting NameError is caught
            # below and every call returns "" - confirm the import exists.
            text = textract.process(doc, encoding='utf-8').decode('utf-8')
        except Exception:
            # Best effort: an unreadable document yields an empty script.
            text = ""
    finally:
        # Remove the temporary file even when the download itself failed.
        if os.path.isfile(doc):
            os.remove(doc)
    return text

## 4- Find all screenplays from dailyscript website

In [33]:
def get_dailyscript():
    """Download every screenplay listed on dailyscript.com as a text file.

    Both index pages (A-M and N-Z) are read. Each linked script is fetched
    according to its extension (``.pdf``, ``.html``, ``.htm`` or ``.txt``)
    and written to ``scripts/unprocessed/dailyscript/<title>.txt``. Entries
    without a link, scripts that cannot be downloaded, and scripts that
    yield no text or no usable title are skipped.
    """
    ALL_URL_1 = "https://www.dailyscript.com/movie.html"
    ALL_URL_2 = "https://www.dailyscript.com/movie_n-z.html"
    BASE_URL = "https://www.dailyscript.com/"
    # Same layout as the other scrapers. The merge step reads
    # scripts/unprocessed/dailyscript, so writing to a bare "dailyscript"
    # directory would leave these files out of the final result.
    DIR = os.path.join("scripts", "unprocessed", "dailyscript")

    os.makedirs(DIR, exist_ok=True)

    soup_1 = get_soup(ALL_URL_1)
    soup_2 = get_soup(ALL_URL_2)

    movielist = soup_1.find_all('ul')[0].find_all('p')
    movielist += soup_2.find_all('ul')[0].find_all('p')

    for movie in tqdm(movielist):
        # Paragraphs with fewer than two children are not script entries.
        if len(movie.contents) < 2:
            continue
        link = movie.find('a')
        if link is None or not link.get('href'):
            continue

        script_url = link.get('href')
        full_url = BASE_URL + urllib.parse.quote(script_url)
        text = ""

        try:
            if script_url.endswith('.pdf'):
                text = get_pdf_text(full_url)
            elif script_url.endswith(('.html', '.htm')):
                script_soup = get_soup(full_url)
                # Prefer the <pre> block; fall back to the whole page so a
                # page without <pre> no longer raises AttributeError.
                pre = script_soup.pre
                text = pre.get_text() if pre else script_soup.get_text()
            elif script_url.endswith('.txt'):
                text = get_soup(full_url).get_text()
        except OSError:
            # urllib's URLError/HTTPError derive from OSError. One dead link
            # must not abort the whole crawl.
            continue

        # Sanitise first: a title made only of stripped characters would
        # otherwise be written as ".txt".
        name = format_filename(link.text)
        if text == "" or name == "":
            continue

        with open(os.path.join(DIR, name + '.txt'), 'w', errors="ignore") as out:
            out.write(text)
# Run the dailyscript scraper; the message now names the site actually fetched.
print("Fetching from dailyscript")
get_dailyscript()
print()

## 5- Find all screenplays from sfy website

In [None]:
def get_sfy():
    """Download every screenplay listed on sfy.ru as a text file.

    The index page is read, each linked script is fetched (PDF via
    ``get_pdf_text``, anything else from the page's ``<pre>`` element) and
    written to ``scripts/unprocessed/sfy/<title>.txt``. The four-digit year
    and the empty parentheses it leaves behind are removed from the title.
    Scripts that fail to download or yield no text or title are skipped.
    """
    ALL_URL = "https://sfy.ru/scripts"
    BASE_URL = "https://sfy.ru"
    DIR = os.path.join("scripts", "unprocessed", "sfy")

    os.makedirs(DIR, exist_ok=True)

    soup = get_soup(ALL_URL)
    movielist = soup.find_all('div', class_='row')[1]
    # Drop the nested <ul> (if any) so its links are not treated as scripts.
    unwanted = movielist.find('ul')
    if unwanted is not None:
        unwanted.extract()
    movielist = movielist.find_all('a')

    for movie in tqdm(movielist):
        script_url = movie.get('href')
        if not script_url:
            # <a> without an href cannot be followed.
            continue

        name = re.sub(r"(\d{4})", "", format_filename(movie.text))
        # Also trim the space that removing "(1979)" leaves at the end,
        # otherwise files are written as "Title .txt".
        name = name.replace('()', "").strip("- ")
        text = ""

        # Only site-relative links need the base prepended; absolute http://
        # links were previously mangled by the "https"-only test.
        if not script_url.startswith(('http://', 'https://')):
            script_url = BASE_URL + script_url

        # `except Exception` keeps the best-effort behaviour but, unlike a
        # bare `except`, still lets Ctrl-C stop the crawl.
        if script_url.endswith('.pdf'):
            try:
                text = get_pdf_text(script_url)
            except Exception:
                continue
        else:
            try:
                script_soup = get_soup(script_url).pre
                if script_soup:
                    text = script_soup.get_text()
            except Exception:
                continue

        if text == "" or name == "":
            continue

        with open(os.path.join(DIR, name + '.txt'), 'w', errors="ignore") as out:
            out.write(text)
# Run the sfy scraper; the message now names the site actually fetched.
print("Fetching from sfy")
get_sfy()
print()

## 6- Find all screenplays from screenplays website

In [15]:
def get_screenplays():
    """Download every screenplay listed on screenplays-online.de.

    Links in the ``screenplay-listing`` table whose href starts with
    ``screenplay`` are followed, and the text of each page's ``<pre>``
    element is written to ``scripts/unprocessed/screenplays/<title>.txt``.
    Pages that cannot be fetched, have no ``<pre>``, or yield no text or
    title are skipped.
    """
    ALL_URL = "https://www.screenplays-online.de/"
    BASE_URL = "https://www.screenplays-online.de/"
    DIR = os.path.join("scripts", "unprocessed", "screenplays")

    os.makedirs(DIR, exist_ok=True)

    soup = get_soup(ALL_URL)
    mlist = soup.find_all('table', class_="screenplay-listing")[0].find_all("a")
    # `or ""` guards against anchors that carry no href attribute.
    movielist = [x for x in mlist
                 if (x.get('href') or "").startswith("screenplay")]

    for movie in tqdm(movielist):
        name = format_filename(movie.text)
        script_url = movie.get('href')

        try:
            script_soup = get_soup(BASE_URL + urllib.parse.quote(script_url))
        except OSError:
            # urllib's URLError/HTTPError derive from OSError. One dead link
            # must not abort the whole crawl.
            continue

        if not script_soup.pre:
            continue
        text = script_soup.pre.get_text()

        # Same guard as the other scrapers: never write empty or unnamed files.
        if text == "" or name == "":
            continue

        with open(os.path.join(DIR, name + '.txt'), 'w', errors="ignore") as out:
            out.write(text)
# Run the screenplays-online scraper; the message now names the site actually fetched.
print("Fetching from screenplays")
get_screenplays()
print()

Fetching from weeklyscript


 96%|████████████████████████████████████████████████████████████████████████████▌   | 113/118 [05:09<00:08,  1.69s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [05:26<00:00,  2.77s/it]





## 7- Find all screenplays from scriptsavant website

In [None]:
def get_scriptsavant():
    """Download every PDF screenplay listed on thescriptsavant.com.

    Both index pages (A-M and N-Z) are read. Links ending in ``.pdf`` are
    downloaded through ``get_pdf_text`` and written to
    ``scripts/unprocessed/scriptsavant/<title>.txt``. Non-PDF links, failed
    downloads, and scripts that yield no text or title are skipped.
    """
    ALL_URL_1 = "https://thescriptsavant.com/free-movie-screenplays-am/"
    ALL_URL_2 = "https://thescriptsavant.com/free-movie-screenplays-nz/"
    # The unused BASE_URL (pointing at an unrelated site) was removed: hrefs
    # are passed to get_pdf_text exactly as found on the page.
    DIR = os.path.join("scripts", "unprocessed", "scriptsavant")

    os.makedirs(DIR, exist_ok=True)

    soup_1 = get_soup(ALL_URL_1)
    soup_2 = get_soup(ALL_URL_2)

    movielist = soup_1.find_all('tbody')[0].find_all('a')
    movielist += soup_2.find_all('div', class_='fusion-text')[0].find_all('a')

    for movie in tqdm(movielist):
        name = format_filename(movie.text.strip())
        script_url = movie.get('href')

        # Anchors without an href, and anything that is not a PDF, are not
        # scripts.
        if not script_url or not script_url.endswith('.pdf'):
            continue

        try:
            text = get_pdf_text(script_url)
        except Exception:
            # Best effort, but unlike a bare `except` this still lets Ctrl-C
            # stop the crawl.
            continue

        if text == "" or name == "":
            continue

        with open(os.path.join(DIR, name + '.txt'), 'w', errors="ignore") as out:
            out.write(text)
# Run the scriptsavant scraper; the message now names the site actually fetched.
print("Fetching from scriptsavant")
get_scriptsavant()
print()

## 8- Collect the scripts from all sources, remove duplicates, and write cleaned copies to one directory

In [None]:
from fuzzywuzzy import fuzz
from os import listdir, makedirs
from os.path import isfile, join, sep, getsize, exists
from tqdm import tqdm
import re
import itertools
import string

# Directories the scrapers write their raw text files to.
DIR_DAILY = join("scripts", "unprocessed", "dailyscript")
DIR_SCREEN = join("scripts", "unprocessed", "screenplays")
DIR_SAVANT = join("scripts", "unprocessed", "scriptsavant")
DIR_SFY = join("scripts", "unprocessed", "sfy")

DIR_FILTER = join("scripts", "filtered")
DIR_FINAL = join("scripts", "final")


def _list_scripts(directory, min_size=3000):
    """Return the paths of regular files in *directory* above *min_size* bytes.

    Smaller files are left out of the merge. A directory that does not exist
    (for example because that scraper was never run) yields an empty list
    instead of raising.
    """
    if not exists(directory):
        return []
    candidates = (join(directory, f) for f in listdir(directory))
    return [p for p in candidates if isfile(p) and getsize(p) > min_size]


daily = _list_scripts(DIR_DAILY)
screen = _list_scripts(DIR_SCREEN)
savant = _list_scripts(DIR_SAVANT)
sfy = _list_scripts(DIR_SFY)

# Order matters: later steps walk the sources in this order.
sources = {
    'savant': savant,
    'daily': daily,
    'screen': screen,
    'sfy': sfy
}

# Words dropped from a title before two titles are compared.
forbidden = ["the", "a", "an", "and", "or", "part",
             "vol", "chapter", "movie"]
# Characters dropped from a title before two titles are compared.
symbols = ["!", "@", "#", "$", "%", "^", "&", "*",
           "_", "+", ":", ".", ",", "?", "\'", "/"]

def _normalized_title(path):
    """Reduce a script path to the key used to detect same-title files.

    The file stem is lower-cased and split on hyphens, words listed in
    ``forbidden`` are dropped, the rest is concatenated, and characters
    listed in ``symbols`` are removed.
    """
    stem = path.split('.txt')[0].split(sep)[-1].lower()
    words = [w for w in stem.split("-") if w not in forbidden]
    joined = "".join(words).strip()
    return "".join(ch for ch in joined if ch not in symbols)


def _stripped_length(path):
    """Return the length of the file's text, ignoring surrounding whitespace."""
    with open(path, 'r', errors="ignore") as fh:
        return len(fh.read().strip())


def remove_duplicates(arr, comb):
    """Remove, in place, all but the longest script among same-title files.

    Args:
        arr: List of script paths; entries are removed from this list.
        comb: Iterable of ``(path, path)`` pairs drawn from *arr* that
            should be compared with each other.

    Returns:
        The same list object as *arr*, after removal.
    """
    titles = {}  # path -> normalised title, computed once per path

    for (x, y) in tqdm(comb):
        for path in (x, y):
            if path not in titles:
                titles[path] = _normalized_title(path)

        if titles[x] != titles[y]:
            continue
        # A file removed by an earlier pair has already lost to a longer one;
        # comparing it again would only re-read both files from disk.
        if x not in arr or y not in arr:
            continue

        # Keep the longer script; on a tie the first one of the pair stays.
        if _stripped_length(y) > _stripped_length(x):
            arr.remove(x)
        else:
            arr.remove(y)

    return arr

# Pass 1: within each source, drop same-title files (the lists stored in
# `sources` are modified in place by remove_duplicates).
for key, arr in sources.items():
    print("Remove duplicates from", key, len(arr))
    comb = list(itertools.combinations(arr, 2))
    arr = remove_duplicates(arr, comb)
    print("Non duplicates", len(arr))
    print()

print("Remove duplicates between sources")

# Pass 2: add the sources one by one to a combined list and de-duplicate the
# combined list after every addition.
all_sources = []
for key, arr in sources.items():
    all_sources.extend(arr)
    print(len(all_sources))
    comb_all = list(itertools.combinations(all_sources, 2))
    all_sources = remove_duplicates(all_sources, comb_all)
    print(len(all_sources))
    print()
print("Remove different versions of scripts with same name")

# Only files that still exist and are larger than 3000 bytes take part.
filtered = [f for f in all_sources if isfile(f) and getsize(f) > 3000]

print(len(filtered))

comb_filter = list(itertools.combinations(filtered, 2))

# Newlines, tabs and spaces are dropped and hyphens become spaces, exactly as
# the previous chain of replace() calls did, but in one pass.
_SQUEEZE = str.maketrans({"\n": None, "\t": None, " ": None, "-": " "})
# path -> (first 300 squeezed characters, squeezed length). Each file is read
# once instead of once per candidate pair; only the small summary is kept.
_content_summary = {}


def _title_key(path):
    """Return the lower-cased file stem with all spaces removed."""
    return "".join(path.split(sep)[-1].split('.txt')[0].split(" ")).lower()


def _summarize(path):
    """Return ``(prefix, length)`` of the squeezed text of *path* (cached)."""
    if path not in _content_summary:
        with open(path, 'r', errors="ignore") as fh:
            squeezed = fh.read().translate(_SQUEEZE)
        _content_summary[path] = (squeezed[:300], len(squeezed))
    return _content_summary[path]


for (x, y) in tqdm(comb_filter):
    # Cheap test first: titles must be at least roughly similar.
    if fuzz.ratio(_title_key(x), _title_key(y)) <= 50:
        continue

    comp_1, len_1 = _summarize(x)
    comp_2, len_2 = _summarize(y)

    # Two files count as versions of the same script when their first 300
    # squeezed characters are more than 80% similar.
    if fuzz.ratio(comp_1, comp_2) <= 80:
        continue

    # Keep the longer version; the shorter may already be gone from the list.
    shorter = x if len_2 > len_1 else y
    if shorter in filtered:
        filtered.remove(shorter)


# Make sure the output directory exists before anything is written to it.
makedirs(DIR_FINAL, exist_ok=True)

# Number of cleaned scripts written, keyed by the source directory name.
counts = {
    'scriptsavant': 0,
    'dailyscript': 0,
    'screenplays': 0,
    'sfy': 0
}

# Patterns are compiled once here instead of once per file. The two patterns
# that were compiled but never used have been dropped.
pagenumber = re.compile(
    r'^[(]?\d{1,3}[)]?[\.]?$|^.[(]?\d{1,3}[)]?[\.]?$|^[(]?\d{1,3}[)]?.?[(]?\d{1,3}[)]?[\.]?$')
cont = re.compile(r'^\(continued\)$|^continued:$')
allspecialchars = re.compile(r'^[^\w\s ]*$')


def _keep_line(raw_line):
    """Return True if *raw_line* should be kept in the cleaned script."""
    line = raw_line.lower().strip()

    # Skip lines with one char since they're likely typos, except the real
    # one-letter words "a" and "i". (The previous `!= 'a' or != 'i'` test
    # was always true and dropped those as well.)
    if len(line) == 1 and line not in ('a', 'i'):
        return False

    # Skip lines containing page numbers.
    if pagenumber.match(line):
        return False

    # Skip "(continued)" / "continued:" markers.
    if cont.match(line):
        return False

    # Skip lines containing just special characters; blank lines are kept.
    if line != '' and allspecialchars.match(line):
        return False

    return True


print("Write cleaned files to new dir")
for source in tqdm(filtered):
    with open(source, 'r', errors="ignore") as f:
        data = f.read().strip()
    data = data.replace(
        "Script provided for educational purposes. More scripts can be found here: http://www.sellingyourscreenplay.com/library", "")
    data = data.encode('utf-8', 'ignore').decode('utf-8').strip()

    lines = [line for line in data.split('\n') if _keep_line(line)]
    final_data = '\n'.join(lines)

    if final_data.strip() == "":
        continue

    # The parent directory name identifies the source; .get() keeps an
    # unexpected directory name from raising KeyError half-way through.
    site = source.split(sep)[-2]
    counts[site] = counts.get(site, 0) + 1
    with open(join(DIR_FINAL, source.split(sep)[-1]), 'w', errors="ignore") as out:
        out.write(final_data)

print(counts)

## 9- Compare all scripts names with the top 250 movies names

In [43]:
# All cleaned script files.
file = os.listdir(r"scripts/final")

# Keep the real file name next to the title derived from it, so the matched
# files can be looked up exactly as they are named on disk. Rebuilding the
# name from the title turned "The Matrix.txt" into "The-Matrix.txt".
dfx = pd.DataFrame({'File': file})
dfx['Movie Name'] = (
    dfx['File']
    # Remove only a literal trailing ".txt"; an unescaped '.' would match
    # any character followed by "txt" anywhere in the name.
    .str.replace(r'\.txt$', '', regex=True)
    .str.replace('-', ' ', regex=False)
    .str.strip()
)
# One file per title.
dfx = dfx.drop_duplicates(subset='Movie Name').dropna()

# Titles present both in the top-250 list and among the scripts.
result = pd.merge(df, dfx, on=['Movie Name'])
# File names (including ".txt") of the matching scripts.
result = result['File'].tolist()

## 10- Create a new directory with the final top 250 movies scripts

In [41]:
# NOTE(review): `source_dir` and `new_directory` are not defined in the
# visible part of this file - presumably set in a cell that is not shown
# (source_dir is likely "scripts/final"); confirm before running.

# Create the target directory if it does not exist yet.
os.makedirs(new_directory, exist_ok=True)

files = os.listdir(source_dir)
available = set(files)
for f in result:
    # Only move files that are actually present in the directory they are
    # moved from.
    if f in available:
        # os.path.join instead of a hard-coded '\\' keeps this portable.
        shutil.move(os.path.join(source_dir, f), new_directory)