# Seinfeld Script Scraper 

### URL : http://www.seinology.com/scripts.shtml

In [1]:
from lxml import html
from json import dump,loads
from requests import get
import json
import re
from dateutil import parser as dateparser
from time import sleep
import pandas as pd
from itertools import chain
import numpy as np
from bs4 import BeautifulSoup
import urllib.request
import datetime

In [2]:
def parse_sein_url(url):
    texts = ""
    response_ep = get(url)
    cleaned_response_ep = response_ep.text.replace('\x00', "")
    parser_ep = html.fromstring(cleaned_response_ep)
    return parser_ep.xpath(".//p//font//text()")

def clean_text(text):
    episode_info = {}
    series_text = pd.Series(text).str.replace("\xa0","")
    clean_text = series_text.str.replace("(\n|\t)","")
    episode_info["title"] = clean_text[0]
    episode_info["chronology"] = clean_text[1]
    episode_info["date"] = clean_text[2]
    episode_info["etc"] = clean_text[3]
    if episode_info["etc"] != '':
        if episode_info["title"] == 'Episode 45 - The Wallet (1)':
            episode_info["writer"] = clean_text[6]
            episode_info["director"] = clean_text[7]
        elif episode_info["title"] == "Originally Transcribed by unknown":
            episode_info["title"] = clean_text[2]
            episode_info["chronology"] = clean_text[3]
            episode_info["date"] = clean_text[4]
            episode_info["etc"] = ''
            episode_info["writer"] = clean_text[5]
            episode_info["director"] = clean_text[6]
        else:
            episode_info["writer"] = clean_text[5]
            episode_info["director"] = clean_text[6]
    else:
        episode_info["writer"] = clean_text[4]
        episode_info["director"] = clean_text[5]
    return episode_info

def extract_text(text):
    series_text = pd.Series(text)
    clean_text = series_text.str.replace("(\xa0|\n|\t)","")
    clean_text = clean_text.str.replace("\x91","'")
    clean_text = clean_text.str.replace("\x92","'")
    clean_text = clean_text.str.replace("\x93",'""')
    clean_text = clean_text.str.replace("\x94",'""')
    clean_text = clean_text.str.replace("\x96",'-')
    start_script = clean_text[clean_text.str.contains("====")].index[0] + 1
    lines = clean_text.iloc[start_script:]
    clean_lines = lines[lines != ""]
    return clean_lines.tolist()

def extract_cast_lines(a_list,cast_member):
    cast_member = cast_member.upper()
    if cast_member == "KRAMER":
        return [i for i in a_list if "{0}:".format(cast_member) in i or "KESSLER:" in i]
    elif cast_member == "ETC":
        return [i for i in a_list if ("GEORGE:" not in i) and ("ELAINE:") not in i and ("JERRY:" not in i) and ("KRAMER:" not in i) and ("KESSLER:" not in i) and ("NEWMAN:" not in i)]
    else:
        return [i for i in a_list if "{0}:".format(cast_member) in i]

In [3]:
def clean_data(series):
    series = series.str.lower()
    series = series.str.replace("\n","")
    series = series.str.replace("\r","")
    series = series.str.replace("-"," ")
    series = series.str.replace("[^\w\s]"," ")
    return series

def explode(df, lst_cols, fill_value=''):
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
        
    idx_cols = df.columns.difference(lst_cols)

    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [4]:
response = get("http://www.seinology.com/scripts-english.shtml")
cleaned_response = response.text.replace('\x00', "")

parser = html.fromstring(cleaned_response)

episodes = {}
regexp = re.compile(r'\d.shtml')
for i in parser.xpath('.//a'):
    try:
        if regexp.search(i.attrib["href"]):
            if i.text == None:
                episodes[i.xpath(".//font")[0].text] = "http://www.seinology.com/{0}".format(i.attrib["href"])
            else:
                episodes[i.text] = "http://www.seinology.com/{0}".format(i.attrib["href"])
    except:
        x = 1
        
df = pd.DataFrame(list(episodes.items()),columns=["Episode","URL"])

In [5]:
df["text"] = df["URL"].apply(parse_sein_url)
text_info = pd.DataFrame(df["text"].apply(clean_text))["text"].apply(pd.Series)

In [6]:
dat = df.join(text_info)
dat.loc[82,"writer"] = "Larry Charles"
dat.loc[81:82,"director"] = "Tom Cherones"

In [7]:
dat["lines"] = dat["text"].apply(extract_text)

In [8]:
dat["jerry_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"JERRY"))
dat["elaine_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"ELAINE"))
dat["kramer_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"KRAMER"))
dat["george_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"GEORGE"))
dat["newman_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"NEWMAN"))
dat["etc_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"etc"))

In [9]:
dat["date"] = dat["date"].str.replace("Broadcast date: ","")

In [10]:
dat["date"] = pd.to_datetime(dat['date'])

In [11]:
dat[["code","season","episode"]] = dat["chronology"].str.split("(season|episode[^ ]*) ",expand=True).drop([1,3],axis=1)

In [12]:
dat["code"] = dat["code"].str.replace("(pc:|,)","")

In [13]:
dat["season"] = dat["season"].str.replace("(season |,)","")

In [14]:
dat["writer"] = (dat["writer"].
                 str.replace("(Written By |Written by |Written by: )","").
                 str.replace("( and |, | / Story By | & )",",").
                 str.split(","))

In [15]:
dat["director"] = (dat["director"].str.replace("(Directed By |Directed by |Story by )","").
                 str.replace("( & |,| and |, |Part 2 Written by )",",").
                 str.split(","))

In [16]:
explode_dir = explode(dat,"director")
explode_writ = explode(dat,"writer")

In [17]:
explode_writ["writer"].str.strip().value_counts().head()

Larry David       58
Peter Mehlman     21
Larry Charles     20
Jerry Seinfeld    18
Alec Berg         14
Name: writer, dtype: int64

In [18]:
explode_dir["director"].str.strip().value_counts().head()

Andy Ackerman    84
Tom Cherones     77
Peter Mehlman     3
Larry David       3
Gregg Kavet       2
Name: director, dtype: int64

In [19]:
dat.loc[0,"season"] = "Pilot"

In [20]:
dat["season"] = dat["season"].str.strip()

In [21]:
dat["season"].value_counts().sort_values("index")

Pilot     1
1         4
2        12
8        22
5        22
3        23
9        24
7        24
4        24
6        24
Name: season, dtype: int64

In [37]:
index = dat["season"].value_counts().index 

In [52]:
season_lines = pd.Series(index = index)
for i in season_lines.index:
    season_lines[i] = dat[dat["season"] == "{0}".format(i)]["lines"].str.join(" <N> ").values

In [61]:
import random

def train_uni_markov_chain(scripts1):
    
    chain = {"<START>": []}
    for scripts in scripts1:
        for script in scripts:
            new_script = script.split(" ")
            chain["<START>"].append(new_script[0])
            for i in range(len(new_script)):
                if new_script[i] not in chain:
                    if i == len(new_script) - 1:
                        chain[new_script[i]] = ["<END>"]
                    else:
                        chain[new_script[i]] = [new_script[i+1]]
                else:
                    if i == len(new_script) - 1:
                        chain[new_script[i]].append("<END>")
                    else:
                        chain[new_script[i]].append(new_script[i+1])

    return chain

def generate_uni_new_script(chain):  
    
    words = []

    words.append(random.choice(chain["<START>"]))
    
    start_word = words[0]
    while start_word != "<END>":
        next_word = random.choice(chain[start_word])
        words.append(next_word)
        start_word = next_word

    lyrics = " ".join(words[:-1])
    return "\n".join(lyrics.split("<N>"))

def train_n_markov_chain(scripts1, n = 2):
    for scripts in scripts1:
        initial = []
        for i in range(n):
            if i == (n - 1):
                initial.append("<START>")
            else:
                initial.append(None)
        initial = tuple(initial)
        chain = {initial: []}
        for script in scripts:
            new_scripts = script.split(" ")
            chain[initial].append(new_scripts[0])
            for i in range(len(new_scripts)):
                lookup = []
                for j in range(n):
                    lookup.append(new_scripts[i - n + j + 1])
                lookup = tuple(lookup)
                if i < n - 1:
                    buffer = []
                    for j in range(n - i - 2):
                        buffer.append(None)
                    if i == 0:
                        buffer.append("<START>")
                        buffer.append(new_scripts[i])
                    else:
                        buffer = list(save[1:])
                        buffer.append(new_scripts[i])
                    buffer = tuple(buffer)
                    save = buffer
                    chain[buffer] = [new_scripts[i+1]]
                elif lookup not in chain:
                    if i == len(new_scripts) - 1:
                        chain[lookup] = ["<END>"]
                    else:
                        chain[lookup] = [new_scripts[i+1]]
                else:
                    if i == len(new_scripts) - 1:
                        chain[lookup].append("<END>")
                    else:
                        chain[lookup].append(new_scripts[i+1])

    return chain

def generate_n_new_script(chain, n = 2):
    words = []
    initial = []
    for i in range(n):
        if i == (n - 1):
            initial.append("<START>")
        else:
            initial.append(None)
    initial = tuple(initial)
    words.append(random.choice(chain[initial]))
    buffer = list(initial[1:])
    buffer.append(words[-1])
    start_word = tuple(buffer)
    counter = 0
    save = []
    while "<END>" not in start_word:
        next_word = random.choice(chain[start_word])
        words.append(next_word)
        if counter <= n - 3:
            buffer = []
            for j in range(n - counter - 3):
                buffer.append(None)
            if counter == 0:
                buffer.append("<START>")
                buffer.append(words[-2])
                buffer.append(words[-1])
                start_word = tuple(buffer)
                save = start_word
            else:
                buffer = list(save[1:])
                buffer.append(words[-1])
                start_word = tuple(buffer)
                save = start_word
            counter += 1
        else:
            start_word = list(save)[1:]
            start_word.append(words[-1])
            start_word = tuple(start_word)
        save = start_word

    lyrics = " ".join(words[:-1])
    return "\n".join(lyrics.split("<N>"))

In [64]:
chain = train_uni_markov_chain(season_lines.values)

In [65]:
print(generate_uni_new_script(chain))

[Waitress giving them walk around here if you can't leave my town.  
 GEORGE: The president of Jerry's apartment) 
 (Jerry looks down there contemplating what you go with Tammy? 
 GEORGE: Where you on stage, performing.) 
 [George and go on Jerry's tiles to open and Elaine and that he's not gonna score tied up. What strike? They make who she sent a solution to the middle, . Kramer. 
 GAIL: It's your total disregard. It's deja vu all over to keep an adoption agency.  
 SALES ASSOCIATE: I got your socks, and takes a close to me a pen? We had to Elaine) Want me, Newman, so funny 
 SUSAN: I've come on. Come on. 
 JERRY: Oh I mean, it's all squared away. He even by and Kramer.] 
 JERRY: oh.. whoa..  
 (Scene ends) 
 ELAINE: He stumbles into the dinner with the background, a sandwich, I'll have never met an operation. They're all the spirit of the door) That is measuring a salad, when you around here tomorrow at the sound "mmm") then winces under his arms about that? 
 ELAINE: Hey! Hey! Come

In [66]:
chain = train_n_markov_chain(season_lines.values,3)

In [68]:
print(generate_n_new_script(chain,3))

INT. COMEDY CLUB - NIGHT 
 (Jerry is on the couch, takes off her shoes and opens some buttons of her shirt.) So uh, can I get you anything? Uuhhh, bread, water...salad-dressing? 
 LAURA: (laughs) Actually, um, do you have any wine? 
 JERRY: Uh, no, yeah, go right ahead. 
 (She turns down the lamp.) 
 LAURA: Uh, Jerry, uh, I was wandering, would it be possible - and if it's not, fine - for me to stay here tomorrow night too? 
 (Jerry takes off his shoes to make himself comfortable) 
 JERRY: Uh, yeah, yeah, sure, why don't you strip down and meet some of the people you'll be workin' with?""  
 INT. AIRPORT - NIGHT 
 (Jerry is on stage, performing.) 
 JERRY: Do you know that I was almost... a lawyer. 
 (Kessler shows with his fingers how close he was.) 
 JERRY: That close, huh? 
 KESSLER: You know, I almost wound up going to that game. 
 JERRY: (cynical) This is the signal? Thank you, Mr. Signal. Where were you yesterday? 
 GEORGE: I see, well, that's open to interpretation. Because so mu

In [33]:
dat[["season","jerry_lines"]].head()

Unnamed: 0,season,jerry_lines
0,Pilot,[JERRY: Do you know what this is all about? Do...
1,1,"[JERRY: So, I'm on line at the supermarket. Tw..."
2,1,"[JERRY: So I move into the centre lane, now I ..."
3,1,[JERRY: Most men like working on things. Tools...
4,1,[JERRY: Went out to dinner the other night. Ch...


In [34]:
index = dat["season"].value_counts().index 

In [69]:
jerry_lines = pd.Series(index = index)
for i in jerry_lines.index:
    jerry_lines[i] = dat[dat["season"] == "{0}".format(i)]["jerry_lines"].str.join(" <N> ").values

In [71]:
chain = train_uni_markov_chain(jerry_lines.values)

In [72]:
print(generate_uni_new_script(chain))

JERRY: How you gonna get a riddle. 
 JERRY: I figure out of fact. I think the Hospital. I take over here. You what? 
 JERRY: I would be real smart guy who runs out his walking away from Dr. Stevens?"" ""Oh, hi Stu. 
 JERRY: Hey, it's just telling me. It was around the IRS. And there's nothing to say, say 'thank you' card from them in the stomach for insurance doesn't matter with bloodstains all the same shoes with one of that either. 
 JERRY: I'm just want to happiness of an appointment for your shirt. 
 JERRY: Elaine, I sure what I dunno... 
 JERRY: He's hostile, he's black. 
 JERRY: Excuse me, if he get it someplace else. Family, friends, you have to guard stops calling. (George enters.) 
 JERRY: Ah. But you think! It was just doesn't he might have any of his blood isn't too much more powerful one of actually started screaming: "There he out on the keys.) I have any way I gotta get upset that stuff? 
 JERRY: He can just get this is dead? 
 JERRY:Alright, alright, I never thought they

In [73]:
chain = train_n_markov_chain(jerry_lines.values,3)

In [74]:
print(generate_n_new_script(chain,3))

JERRY: Do you know what this is all about? Do you know, why we're here? To be out, this is out...and out is one of the amazing things to me...I have no cotton-balls, we're all human beings, what is the story? I've never had one...I never bought one, I never needed one, I've never been in a situation, when I thought to myself: ""I could use a cotton-ball right now."" I can certainly get out of this mess. Women need them and they don't need one or two, they need thousands of them, they need bags, they're like peat moss bags, have you ever seen the big trucks out on the street? Yeah, no problem. 
 JERRY: (upset) Ohhhh, what are you doing? Kessler, it's a tape! I taped the game, it's one o'clock in the morning! I avoided human contact all night to watch this. 
 JERRY: Meat? I don't, I don't know, go... hunt! (Kessler opens the refrigerator and sticks his head in.) Well what happened in the Met game, don't say anything, I taped it, hello. Yeah, no, I'm sorry, you have the wrong number. Yeah

In [80]:
dat["lines"][0].str.ext

['INT. COMEDY CLUB - NIGHT',
 '(Jerry is on stage, performing.)',
 'JERRY: Do you know what this is all about? Do you know, why we\'re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about ""We should go out""? This is what they\'re talking about...this whole thing, we\'re all out now, no one is home. Not one person here is home, we\'re all out! There are people tryin\' to find us, they don\'t know where we are. (on an imaginary phone) ""Did you ring?, I can\'t find him."" ""Where did he go?"" ""He didn\'t tell me where he was going"". He must have gone out. You wanna go out: you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you\'re standing around, whatta you do? You go: ""We gotta be getting back"". Once you\'re out, you wanna get back! You wanna go to sleep, you wanna get up, you wann

In [85]:
dat["lines"][0]

['INT. COMEDY CLUB - NIGHT',
 '(Jerry is on stage, performing.)',
 'JERRY: Do you know what this is all about? Do you know, why we\'re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about ""We should go out""? This is what they\'re talking about...this whole thing, we\'re all out now, no one is home. Not one person here is home, we\'re all out! There are people tryin\' to find us, they don\'t know where we are. (on an imaginary phone) ""Did you ring?, I can\'t find him."" ""Where did he go?"" ""He didn\'t tell me where he was going"". He must have gone out. You wanna go out: you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you\'re standing around, whatta you do? You go: ""We gotta be getting back"". Once you\'re out, you wanna get back! You wanna go to sleep, you wanna get up, you wann