# Seinfeld Script Scraper 

### URL : http://www.seinology.com/scripts.shtml

In [1]:
from lxml import html
from json import dump,loads
from requests import get
import json
import re
from dateutil import parser as dateparser
from time import sleep
import pandas as pd
from itertools import chain
import numpy as np
from bs4 import BeautifulSoup
import urllib.request
import datetime

In [2]:
def parse_sein_url(url):
    """Download one script page and return its text nodes.

    The page at ``url`` is fetched, NUL characters are removed from the
    response text, and the markup is parsed with lxml.

    Returns the result of the XPath query ``.//p//font//text()``: the text
    nodes found under ``<font>`` elements nested in ``<p>`` elements.
    """
    page = get(url)
    # NUL characters are removed before the markup is handed to lxml.
    markup = page.text.replace('\x00', "")
    tree = html.fromstring(markup)
    return tree.xpath(".//p//font//text()")

def clean_text(text):
    """Extract episode metadata from the text nodes of a script page.

    Non-breaking spaces, newlines and tabs are removed from every element of
    ``text``; the metadata fields are then picked by position.

    Parameters
    ----------
    text : sequence of str
        Text nodes of one page, in document order.

    Returns
    -------
    dict
        Keys ``title``, ``chronology``, ``date``, ``etc``, ``writer`` and
        ``director`` (in that order).
    """
    # regex= is spelled out on every call: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would turn the character
    # class below into a literal that never matches.
    fields = (
        pd.Series(text)
        .str.replace("\xa0", "", regex=False)
        .str.replace(r"[\n\t]", "", regex=True)
        .tolist()
    )

    episode_info = {
        "title": fields[0],
        "chronology": fields[1],
        "date": fields[2],
        "etc": fields[3],
    }

    if episode_info["etc"] == '':
        # No extra line after the date: the credits follow immediately.
        writer_at = 4
    elif episode_info["title"] == 'Episode 45 - The Wallet (1)':
        # This page is special-cased: its credits sit one slot further down.
        writer_at = 6
    elif episode_info["title"] == "Originally Transcribed by unknown":
        # The page starts with a transcriber note, so the header fields are
        # re-read two slots further down.
        episode_info["title"] = fields[2]
        episode_info["chronology"] = fields[3]
        episode_info["date"] = fields[4]
        episode_info["etc"] = ''
        writer_at = 5
    else:
        writer_at = 5

    episode_info["writer"] = fields[writer_at]
    episode_info["director"] = fields[writer_at + 1]
    return episode_info

def extract_text(text):
    """Return the non-empty script lines that follow the ``====`` separator.

    Every element of ``text`` has non-breaking spaces, newlines and tabs
    removed and a handful of ``\\x91``-``\\x96`` characters replaced by ASCII
    text. Everything up to and including the first element containing
    ``====`` is discarded, as are elements that end up empty.

    Parameters
    ----------
    text : sequence of str
        Text nodes of one page, in document order.

    Returns
    -------
    list of str

    Raises
    ------
    IndexError
        If no element contains ``====``.
    """
    # regex= is spelled out on every call: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would make the character
    # class below a literal that never matches.
    cleaned = pd.Series(text).str.replace("[\xa0\n\t]", "", regex=True)

    # NOTE(review): \x93/\x94 are each replaced by TWO double-quote
    # characters; kept as in the original, but a single '"' may have been
    # intended - confirm.
    substitutions = (
        ("\x91", "'"),
        ("\x92", "'"),
        ("\x93", '""'),
        ("\x94", '""'),
        ("\x96", '-'),
    )
    for raw_char, replacement in substitutions:
        cleaned = cleaned.str.replace(raw_char, replacement, regex=False)

    # The script body starts right after the first separator line.
    is_separator = cleaned.str.contains("====", regex=False)
    start_script = cleaned[is_separator].index[0] + 1
    body = cleaned.iloc[start_script:]
    return body[body != ""].tolist()

def extract_cast_lines(a_list,cast_member):
    """Select the lines of ``a_list`` that belong to one cast member.

    ``cast_member`` is upper-cased, and a line is selected when it contains
    ``"<NAME>:"``. Two names are special:

    * ``"KRAMER"`` also selects lines containing ``"KESSLER:"``.
    * ``"ETC"`` selects the lines that contain none of the tags for George,
      Elaine, Jerry, Kramer, Kessler or Newman.
    """
    speaker = cast_member.upper()

    if speaker == "ETC":
        main_cast_tags = ("GEORGE:", "ELAINE:", "JERRY:", "KRAMER:", "KESSLER:", "NEWMAN:")
        return [line for line in a_list
                if not any(tag in line for tag in main_cast_tags)]

    wanted_tags = ["{0}:".format(speaker)]
    if speaker == "KRAMER":
        wanted_tags.append("KESSLER:")
    return [line for line in a_list if any(tag in line for tag in wanted_tags)]

In [3]:
def clean_data(series):
    """Normalise a Series of strings for text processing.

    The strings are lower-cased, newlines and carriage returns are removed,
    and hyphens and every other character that is neither a word character
    nor whitespace are replaced by a single space.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    pandas.Series
    """
    # regex= is explicit everywhere: since pandas 2.0 Series.str.replace
    # defaults to regex=False, which would make the last pattern a literal.
    series = series.str.lower()
    series = series.str.replace("\n", "", regex=False)
    series = series.str.replace("\r", "", regex=False)
    series = series.str.replace("-", " ", regex=False)
    # Raw string: "\w" / "\s" are not valid escapes in a normal str literal.
    series = series.str.replace(r"[^\w\s]", " ", regex=True)
    return series

def explode(df, lst_cols, fill_value=''):
    """Expand list-valued columns so that each list item gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
    lst_cols : str or list of str
        Column(s) whose cells are lists. The first one determines how often
        each row is repeated.
    fill_value : scalar, default ''
        Used to fill the missing values of rows whose list is empty; such
        rows are appended after the expanded rows.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as ``df``.
    """
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]

    # Every column that is not being exploded is simply repeated.
    idx_cols = df.columns.difference(lst_cols)

    lens = df[lst_cols[0]].str.len()

    repeated = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    })
    # Flattened in Python rather than with np.concatenate so that empty
    # lists never have to be type-promoted against the non-empty ones.
    flattened = {
        col: [item for cell in df[col] for item in cell]
        for col in lst_cols
    }
    exploded = repeated.assign(**flattened)

    if not (lens > 0).all():
        # Rows with an empty list produce no expanded rows; keep them, with
        # fill_value in place of the missing list item.
        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]]).fillna(fill_value)

    return exploded.loc[:, df.columns]

In [4]:
# Fetch the script index page and collect a {link text: absolute URL}
# mapping for every anchor whose href looks like "<digit>.shtml".
response = get("http://www.seinology.com/scripts-english.shtml")
# NUL characters are removed before the markup is handed to lxml.
cleaned_response = response.text.replace('\x00', "")

parser = html.fromstring(cleaned_response)

episodes = {}
# The dot is escaped so that only a literal ".shtml" suffix matches.
regexp = re.compile(r'\d\.shtml')
for anchor in parser.xpath('.//a'):
    href = anchor.attrib.get("href")
    if href is None or not regexp.search(href):
        # Anchors without an href, or pointing elsewhere, are not episodes.
        continue
    title = anchor.text
    if title is None:
        # The link text may be wrapped in a <font> element instead.
        fonts = anchor.xpath(".//font")
        if not fonts:
            continue
        title = fonts[0].text
    episodes[title] = "http://www.seinology.com/{0}".format(href)

df = pd.DataFrame(list(episodes.items()),columns=["Episode","URL"])

In [5]:
# Download every episode page, then turn the per-episode metadata dicts
# returned by clean_text into one column per metadata key.
df["text"] = df["URL"].map(parse_sein_url)
episode_meta = df["text"].map(clean_text)
text_info = pd.DataFrame(episode_meta.tolist(), index=df.index)

In [6]:
# Attach the parsed metadata columns to the episode table (index-aligned join).
dat = df.join(text_info)
# Manual corrections for specific rows, addressed by row label.
# NOTE(review): presumably positional parsing picked the wrong fields for
# these episodes; the labels depend on the scrape order - confirm.
dat.loc[82,"writer"] = "Larry Charles"
# .loc slices by label and includes both end points, so rows 81 and 82 are set.
dat.loc[81:82,"director"] = "Tom Cherones"

In [7]:
# Reduce each episode's raw text nodes to its cleaned, non-empty script lines.
dat["lines"] = dat["text"].apply(extract_text)

In [8]:
# One "<name>_lines" column per speaker group, each holding the subset of
# the episode's lines selected by extract_cast_lines for that name.
for cast_name in ("JERRY", "ELAINE", "KRAMER", "GEORGE", "NEWMAN", "etc"):
    dat["{0}_lines".format(cast_name.lower())] = dat["lines"].apply(
        lambda x, who=cast_name: extract_cast_lines(x, who)
    )

In [9]:
# Drop the "Broadcast date: " label so only the date text remains.
dat["date"] = dat["date"].str.replace("Broadcast date: ","")

In [10]:
# Parse the remaining date text into pandas datetime values.
dat["date"] = pd.to_datetime(dat['date'])

In [11]:
# Split "chronology" on the words "season" / "episode..." followed by a space.
# Because the pattern has a capture group, the matched keywords come back as
# columns 1 and 3; those are dropped, leaving code, season and episode.
dat[["code","season","episode"]] = dat["chronology"].str.split("(season|episode[^ ]*) ",expand=True).drop([1,3],axis=1)

In [12]:
# Strip the "pc:" prefix and any commas from the production code.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal by default, so the alternation would never match.
dat["code"] = dat["code"].str.replace("(pc:|,)", "", regex=True)

In [13]:
# Strip a leftover "season " label and any commas from the season field.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal by default, so the alternation would never match.
dat["season"] = dat["season"].str.replace("(season |,)", "", regex=True)

In [14]:
# Turn the writer credit into a list of names: remove the "Written by"
# label, normalise the different separators to a comma, then split on it.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal by default, so the alternations would never match.
dat["writer"] = (dat["writer"].
                 str.replace("(Written By |Written by |Written by: )", "", regex=True).
                 str.replace("( and |, | / Story By | & )", ",", regex=True).
                 str.split(","))

In [15]:
# Turn the director credit into a list of names: remove the credit label,
# normalise the different separators to a comma, then split on it.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal by default, so the alternations would never match.
dat["director"] = (dat["director"].
                   str.replace("(Directed By |Directed by |Story by )", "", regex=True).
                   str.replace("( & |,| and |, |Part 2 Written by )", ",", regex=True).
                   str.split(","))

In [16]:
# One row per (episode, director) and one row per (episode, writer), so that
# individual names can be counted.
explode_dir = explode(dat,"director")
explode_writ = explode(dat,"writer")

In [17]:
# Five most frequent writer names, after trimming surrounding whitespace.
explode_writ["writer"].str.strip().value_counts().head()

Larry David       58
Peter Mehlman     21
Larry Charles     20
Jerry Seinfeld    18
Alec Berg         14
Name: writer, dtype: int64

In [18]:
# Five most frequent director names, after trimming surrounding whitespace.
explode_dir["director"].str.strip().value_counts().head()

Andy Ackerman    84
Tom Cherones     77
Peter Mehlman     3
Larry David       3
Darin Henry       2
Name: director, dtype: int64

In [19]:
# Manual override: the row labelled 0 is given the season label "Pilot".
# NOTE(review): presumably its chronology text has no season number - confirm.
dat.loc[0,"season"] = "Pilot"

In [20]:
# Trim surrounding whitespace so equal seasons compare equal.
dat["season"] = dat["season"].str.strip()

In [21]:
# Episodes per season, ordered by ascending count.
# The original passed "index" positionally, which only set `axis`; the
# parameters of Series.sort_values are keyword-only since pandas 2.0, so the
# argument is dropped (the sort itself is unchanged).
dat["season"].value_counts().sort_values()

Pilot     1
1         4
2        12
5        22
8        22
3        23
7        24
4        24
9        24
6        24
Name: season, dtype: int64

In [22]:
# Distinct season labels, in value_counts order.
index = dat["season"].value_counts().index 

In [23]:
# One training text per season: every line of every episode of that season,
# joined with the " <N> " line-break token.
# dtype=object is explicit because the Series is filled with strings; without
# it the empty Series' dtype depends on the pandas version.
season_lines = pd.Series(index=index, dtype=object)
for i in season_lines.index:
    episode_scripts = dat.loc[dat["season"] == "{0}".format(i), "lines"]
    season_lines[i] = " <N> ".join(
        line for script in episode_scripts.values for line in script
    )

In [29]:
import random

def train_uni_markov_chain(scripts):
    """Build a first-order (single-word) Markov chain from ``scripts``.

    Each script is split on single spaces. The returned dict maps every word
    to the list of words that followed it (duplicates kept, in order of
    occurrence). The key ``"<START>"`` lists the first word of each script,
    and ``"<END>"`` is recorded as the successor of each script's last word.
    """
    transitions = {"<START>": []}

    for script in scripts:
        tokens = script.split(" ")
        transitions["<START>"].append(tokens[0])

        # Pair every token with the one after it; the last pairs with <END>.
        successors = tokens[1:] + ["<END>"]
        for token, following in zip(tokens, successors):
            transitions.setdefault(token, []).append(following)

    return transitions

def generate_uni_new_script(chain):
    """Generate text by a random walk over a first-order Markov chain.

    The walk starts with a random choice from ``chain["<START>"]`` and keeps
    picking a random successor of the current word until ``"<END>"`` is
    drawn. The words (without the end marker) are joined with spaces, and
    every ``"<N>"`` token is turned into a newline.
    """
    current = random.choice(chain["<START>"])
    generated = [current]

    while current != "<END>":
        current = random.choice(chain[current])
        generated.append(current)

    # The last element is always the <END> marker; leave it out.
    body = " ".join(generated[:-1])
    return body.replace("<N>", "\n")

def train_n_markov_chain(scripts, n = 2):
    """Build an order-``n`` Markov chain from ``scripts``.

    Each script is split on single spaces. States are tuples of length ``n``
    that slide over the sequence
    ``None, ..., None, "<START>", word_0, word_1, ...``; the returned dict
    maps each state to the list of words observed right after it
    (duplicates kept). ``"<END>"`` is recorded after the last word of every
    script.

    Parameters
    ----------
    scripts : iterable of str
    n : int, default 2
        Number of tokens per state; must be at least 1.

    Returns
    -------
    dict
        ``{tuple: list of str}``. The initial state
        ``(None, ..., None, "<START>")`` is always present.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    initial = (None,) * (n - 1) + ("<START>",)
    chain = {initial: []}

    for script in scripts:
        state = initial
        # Transitions are always appended, including for the padded states at
        # the start of a script, so scripts that open with the same words do
        # not overwrite each other. Appending <END> to the word list also
        # covers scripts shorter than n.
        for word in script.split(" ") + ["<END>"]:
            chain.setdefault(state, []).append(word)
            state = state[1:] + (word,)

    return chain

def generate_n_new_script(chain, n = 2):
    """Generate text by a random walk over an order-``n`` Markov chain.

    Parameters
    ----------
    chain : dict
        Mapping from length-``n`` state tuples to lists of successor words,
        with ``(None, ..., None, "<START>")`` as the initial state and
        ``"<END>"`` as the end marker.
    n : int, default 2
        State length; must match the keys of ``chain`` and be at least 1.

    Returns
    -------
    str
        The generated words joined with spaces, with every ``"<N>"`` token
        turned into a newline.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # The state is a sliding window over the last n tokens. The walk starts
    # from the padded initial state, so the window is well-formed for every
    # n, including the default n=2.
    state = (None,) * (n - 1) + ("<START>",)
    words = []
    while True:
        next_word = random.choice(chain[state])
        if next_word == "<END>":
            break
        words.append(next_word)
        state = state[1:] + (next_word,)

    lyrics = " ".join(words)
    return "\n".join(lyrics.split("<N>"))

In [25]:
# Train the single-word chain on the per-season texts.
# NOTE: this rebinds the module-level name `chain`, which the import cell
# bound to itertools.chain.
chain = train_uni_markov_chain(season_lines.values)

In [26]:
# Print one script generated from the single-word chain. The generator
# defined in this file is named generate_uni_new_script.
print(generate_uni_new_script(chain))

[Opening monologue] 
 GEORGE: Miss Benes . . ) 
 GEORGE: Hey, I wasn't in my stories about that license plate of nothing wrong with them) 
 KRAMER: Oh? 
 KRAMER: Jerry, I cannot believe I was the phone): I still don't tell you to go back seat, I wouldn't have the limit. We're out milk glass, everybody's your shoulder. Jerry stands outside the best. Look. Look. If anyone off the whole thing. You're lucky I didn't get him now. I *whap* let me see him. [clicks over] Hello? Hello, Susan? 
 JERRY: I did they have got there and sitting across the other side pocket and I can you want to go! 
 JACKIE: My cousin's in trouble? 
 ELAINE: Oh, just ran in the feeling through his room, knocking over here! 
 JERRY: Look! Look at six-thirty. The place a Latin phrase George, Elaine and Jerry sits on a little high, it's so sorry. There's something like to meet me to act casual) 
 GEORGE: Right, anyone she like? 
 ELAINE: Are you get it, uh. And you do? 
 GEORGE: A pack of my office. Elaine has a few wee

In [27]:
# Retrain on the per-season texts with three-token states; replaces the
# single-word chain held in `chain`.
chain = train_n_markov_chain(season_lines.values,3)

In [28]:
# Print one script generated from the three-token chain. The generator
# defined in this file is named generate_n_new_script; n must match the
# value used for training.
print(generate_n_new_script(chain,3))

[The cemetery. Jerry, George, and the Rosses are standing in front of Jerry's door)  
 ENZO: Hey, what's all of this?  
 JERRY: Hi. 
 STU: Could we speak for a few seconds.) 
 GEORGE: Dad. 
 FRANK: What do you want to be the big new look in men's fashion.. It's a, a puffy shirt. 
 JERRY: Well, you want to know his name? 
 NEWMAN: Tim Whatley. 
 JERRY: What? 
 GEORGE: I never assisted in a birth before. It's really quite disgusting. 
 JERRY: What about little Joey? 
 GEORGE: Who? 
 JERRY: Ah, she's with her new boyfriend Tony  
 ELAINE: yeah, all right ok  
 JERRY: Me too.  
 JERRY: it can't be  
 KRAMER: Would you stop it. (Susan playfully bites his ear lobe again) Would you quit it please. Someone is going to be on all night. I don't like it. 
 ELAINE: Oh, yeah. 
 ELAINE: Yep. 
 JERRY: Why? 
 GEORGE: Why? Is there anything else about you I should know? 
 JERRY: You know who - isn't here. He was in Chicago, the flight was delayed, how long've you been waiting? 
 GEORGE: I didn't do any