# Seinfeld Script Scraper 

### URL : http://www.seinology.com/scripts.shtml

In [1]:
from lxml import html
from json import dump,loads
from requests import get
import json
import re
from dateutil import parser as dateparser
from time import sleep
import pandas as pd
from itertools import chain
import numpy as np
from bs4 import BeautifulSoup
import urllib.request
import datetime

In [2]:
def parse_sein_url(url):
    texts = ""
    response_ep = get(url)
    cleaned_response_ep = response_ep.text.replace('\x00', "")
    parser_ep = html.fromstring(cleaned_response_ep)
    return parser_ep.xpath(".//p//font//text()")

def clean_text(text):
    episode_info = {}
    series_text = pd.Series(text).str.replace("\xa0","")
    clean_text = series_text.str.replace("(\n|\t)","")
    episode_info["title"] = clean_text[0]
    episode_info["chronology"] = clean_text[1]
    episode_info["date"] = clean_text[2]
    episode_info["etc"] = clean_text[3]
    if episode_info["etc"] != '':
        if episode_info["title"] == 'Episode 45 - The Wallet (1)':
            episode_info["writer"] = clean_text[6]
            episode_info["director"] = clean_text[7]
        elif episode_info["title"] == "Originally Transcribed by unknown":
            episode_info["title"] = clean_text[2]
            episode_info["chronology"] = clean_text[3]
            episode_info["date"] = clean_text[4]
            episode_info["etc"] = ''
            episode_info["writer"] = clean_text[5]
            episode_info["director"] = clean_text[6]
        else:
            episode_info["writer"] = clean_text[5]
            episode_info["director"] = clean_text[6]
    else:
        episode_info["writer"] = clean_text[4]
        episode_info["director"] = clean_text[5]
    return episode_info

def extract_text(text):
    series_text = pd.Series(text)
    clean_text = series_text.str.replace("(\xa0|\n|\t)","")
    clean_text = clean_text.str.replace("\x91","'")
    clean_text = clean_text.str.replace("\x92","'")
    clean_text = clean_text.str.replace("\x93",'""')
    clean_text = clean_text.str.replace("\x94",'""')
    clean_text = clean_text.str.replace("\x96",'-')
    start_script = clean_text[clean_text.str.contains("====")].index[0] + 1
    lines = clean_text.iloc[start_script:]
    clean_lines = lines[lines != ""]
    return clean_lines.tolist()

def extract_cast_lines(a_list,cast_member):
    cast_member = cast_member.upper()
    if cast_member == "KRAMER":
        return [i for i in a_list if "{0}:".format(cast_member) in i or "KESSLER:" in i]
    elif cast_member == "ETC":
        return [i for i in a_list if ("GEORGE:" not in i) and ("ELAINE:") not in i and ("JERRY:" not in i) and ("KRAMER:" not in i) and ("KESSLER:" not in i) and ("NEWMAN:" not in i)]
    else:
        return [i for i in a_list if "{0}:".format(cast_member) in i]

In [3]:
def clean_data(series):
    series = series.str.lower()
    series = series.str.replace("\n","")
    series = series.str.replace("\r","")
    series = series.str.replace("-"," ")
    series = series.str.replace("[^\w\s]"," ")
    return series

def explode(df, lst_cols, fill_value=''):
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
        
    idx_cols = df.columns.difference(lst_cols)

    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [4]:
response = get("http://www.seinology.com/scripts-english.shtml")
cleaned_response = response.text.replace('\x00', "")

parser = html.fromstring(cleaned_response)

episodes = {}
regexp = re.compile(r'\d.shtml')
for i in parser.xpath('.//a'):
    try:
        if regexp.search(i.attrib["href"]):
            if i.text == None:
                episodes[i.xpath(".//font")[0].text] = "http://www.seinology.com/{0}".format(i.attrib["href"])
            else:
                episodes[i.text] = "http://www.seinology.com/{0}".format(i.attrib["href"])
    except:
        x = 1
        
df = pd.DataFrame(list(episodes.items()),columns=["Episode","URL"])

In [5]:
df["text"] = df["URL"].apply(parse_sein_url)
text_info = pd.DataFrame(df["text"].apply(clean_text))["text"].apply(pd.Series)

In [6]:
dat = df.join(text_info)
dat.loc[82,"writer"] = "Larry Charles"
dat.loc[81:82,"director"] = "Tom Cherones"

In [7]:
dat["lines"] = dat["text"].apply(extract_text)

In [8]:
dat["jerry_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"JERRY"))
dat["elaine_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"ELAINE"))
dat["kramer_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"KRAMER"))
dat["george_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"GEORGE"))
dat["newman_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"NEWMAN"))
dat["etc_lines"] = dat["lines"].apply(lambda x : extract_cast_lines(x,"etc"))

In [9]:
dat["date"] = dat["date"].str.replace("Broadcast date: ","")

In [10]:
dat["date"] = pd.to_datetime(dat['date'])

In [11]:
dat[["code","season","episode"]] = dat["chronology"].str.split("(season|episode[^ ]*) ",expand=True).drop([1,3],axis=1)

In [12]:
dat["code"] = dat["code"].str.replace("(pc:|,)","")

In [13]:
dat["season"] = dat["season"].str.replace("(season |,)","")

In [14]:
dat["writer"] = (dat["writer"].
                 str.replace("(Written By |Written by |Written by: )","").
                 str.replace("( and |, | / Story By | & )",",").
                 str.split(","))

In [15]:
dat["director"] = (dat["director"].str.replace("(Directed By |Directed by |Story by )","").
                 str.replace("( & |,| and |, |Part 2 Written by )",",").
                 str.split(","))

In [16]:
explode_dir = explode(dat,"director")
explode_writ = explode(dat,"writer")

In [17]:
explode_writ["writer"].str.strip().value_counts().head()

Larry David       58
Peter Mehlman     21
Larry Charles     20
Jerry Seinfeld    18
Alec Berg         14
Name: writer, dtype: int64

In [18]:
explode_dir["director"].str.strip().value_counts().head()

Andy Ackerman    84
Tom Cherones     77
Larry David       3
Peter Mehlman     3
Darin Henry       2
Name: director, dtype: int64

In [19]:
dat.loc[0,"season"] = "Pilot"

In [20]:
dat["season"] = dat["season"].str.strip()

In [21]:
dat["season"].value_counts().sort_values("index")

Pilot     1
1         4
2        12
8        22
5        22
3        23
4        24
9        24
6        24
7        24
Name: season, dtype: int64

In [22]:
index = dat["season"].value_counts().index 

In [23]:
season_lines = pd.Series(index = index)
for i in season_lines.index:
    season_lines[i] = dat[dat["season"] == "{0}".format(i)]["lines"].str.join(" <N> ").values

In [24]:
import random

def train_uni_markov_chain(scripts1):
    
    chain = {"<START>": []}
    for scripts in scripts1:
        for script in scripts:
            new_script = script.split(" ")
            chain["<START>"].append(new_script[0])
            for i in range(len(new_script)):
                if new_script[i] not in chain:
                    if i == len(new_script) - 1:
                        chain[new_script[i]] = ["<END>"]
                    else:
                        chain[new_script[i]] = [new_script[i+1]]
                else:
                    if i == len(new_script) - 1:
                        chain[new_script[i]].append("<END>")
                    else:
                        chain[new_script[i]].append(new_script[i+1])

    return chain

def generate_uni_new_script(chain):  
    
    words = []

    words.append(random.choice(chain["<START>"]))
    
    start_word = words[0]
    while start_word != "<END>":
        next_word = random.choice(chain[start_word])
        words.append(next_word)
        start_word = next_word

    lyrics = " ".join(words[:-1])
    return "\n".join(lyrics.split("<N>"))

def train_n_markov_chain(scripts1, n = 2):
    for scripts in scripts1:
        initial = []
        for i in range(n):
            if i == (n - 1):
                initial.append("<START>")
            else:
                initial.append(None)
        initial = tuple(initial)
        chain = {initial: []}
        for script in scripts:
            new_scripts = script.split(" ")
            chain[initial].append(new_scripts[0])
            for i in range(len(new_scripts)):
                lookup = []
                for j in range(n):
                    lookup.append(new_scripts[i - n + j + 1])
                lookup = tuple(lookup)
                if i < n - 1:
                    buffer = []
                    for j in range(n - i - 2):
                        buffer.append(None)
                    if i == 0:
                        buffer.append("<START>")
                        buffer.append(new_scripts[i])
                    else:
                        buffer = list(save[1:])
                        buffer.append(new_scripts[i])
                    buffer = tuple(buffer)
                    save = buffer
                    chain[buffer] = [new_scripts[i+1]]
                elif lookup not in chain:
                    if i == len(new_scripts) - 1:
                        chain[lookup] = ["<END>"]
                    else:
                        chain[lookup] = [new_scripts[i+1]]
                else:
                    if i == len(new_scripts) - 1:
                        chain[lookup].append("<END>")
                    else:
                        chain[lookup].append(new_scripts[i+1])

    return chain

def generate_n_new_script(chain, n = 2):
    words = []
    initial = []
    for i in range(n):
        if i == (n - 1):
            initial.append("<START>")
        else:
            initial.append(None)
    initial = tuple(initial)
    words.append(random.choice(chain[initial]))
    buffer = list(initial[1:])
    buffer.append(words[-1])
    start_word = tuple(buffer)
    counter = 0
    save = []
    while "<END>" not in start_word:
        next_word = random.choice(chain[start_word])
        words.append(next_word)
        if counter <= n - 3:
            buffer = []
            for j in range(n - counter - 3):
                buffer.append(None)
            if counter == 0:
                buffer.append("<START>")
                buffer.append(words[-2])
                buffer.append(words[-1])
                start_word = tuple(buffer)
                save = start_word
            else:
                buffer = list(save[1:])
                buffer.append(words[-1])
                start_word = tuple(buffer)
                save = start_word
            counter += 1
        else:
            start_word = list(save)[1:]
            start_word.append(words[-1])
            start_word = tuple(start_word)
        save = start_word

    lyrics = " ".join(words[:-1])
    return "\n".join(lyrics.split("<N>"))

In [25]:
chain = train_uni_markov_chain(season_lines.values)

In [26]:
print(generate_uni_new_script(chain))

[Opening Monologue] 
 ELAINE: Well well obviously looking at Jerry are sitting here to strap in a bet. 
 JERRY: I've never seen it. 
 ELAINE: Hmm! Oh! (hitting George) Hey, it's incredible. 
 (Mr Oh yeah, we've all those Café Latte? 
 JERRY: Oh my clothes off?" And when she's a hernia operation.. Oh so what the towel it like the moving back up. 
 MRS. C: For the first carrying bags and Helen are in being folksy..? 
 JOEL: Howard says "Elaine") Elaine.  
 HELEN: What are shut, and looks like this. I love , I can opener stuff. 
 GEORGE: It's embarrassing! (Leaves) 
 KATIE: Okay, see that?! 
 GEORGE: MOOPS 
 JERRY: For this is that? 
 JERRY: Maybe I want, men like Epstein-Barr Syndrome? 
 ELAINE: Well, I'm sure that you're suggesting, you're going to get in? 
 JERRY: Investigation?  
 GEORGE: Yeah, I'm not talking about having adventures all sides. 
 (George wanders over.) 
 KRAMER: Uh oh. Uh, next to the other apartment to beat Braun! I was your car is helping George is that is her her p

In [27]:
chain = train_n_markov_chain(season_lines.values,3)

In [28]:
print(generate_n_new_script(chain,3))

INT. COMEDY CLUB - NIGHT 
 (Jerry is on stage, peforming.) 
 JERRY: Laundry day is the only exciting day in the life of clothes. It is...y'know, think about it. The washing machine is the nightclub of clothes. You know, it's dark, there's bubbles happening, they're all kinda dancing around in there- shirt grabs the underwear, ""C'mon babe, let's go"". You come by, you open up the lid and they'll- (stiffens up, as the clothes) Socks are the most amazing article of clothing. They hate their lives, they're in the shoes with stinky feet, the boring drawers. The dryer is their only chance to escape and they all know it. They knew a escape from the dryer. A man passes by.) 
 JERRY: Well, senator, I'd just like to know, what you knew and when you knew it. 
 (A waitress, Claire, approaches the table. She pours refills Jerry's coffee.) 
 CLAIRE: Mr. Seinfeld. Mr. Costanza.  
 (Claire tries to refill George's coffee, but George blocks her.) 
 GEORGE: Are, are you sure this is decaf? Where's the 

In [29]:
dat[["season","jerry_lines"]].head()

Unnamed: 0,season,jerry_lines
0,Pilot,[JERRY: Do you know what this is all about? Do...
1,1,"[JERRY: So, I'm on line at the supermarket. Tw..."
2,1,"[JERRY: So I move into the centre lane, now I ..."
3,1,[JERRY: Most men like working on things. Tools...
4,1,[JERRY: Went out to dinner the other night. Ch...


In [30]:
index = dat["season"].value_counts().index 

In [31]:
jerry_lines = pd.Series(index = index)
for i in jerry_lines.index:
    jerry_lines[i] = dat[dat["season"] == "{0}".format(i)]["jerry_lines"].str.join(" <N> ").values

In [32]:
chain = train_uni_markov_chain(jerry_lines.values)

In [33]:
print(generate_uni_new_script(chain))

JERRY: Uh, how much more important? 
 JERRY: It's NOT DRIVING HIM TO DO. I guess. But you, very strongly about? 
 JERRY: (thinking) Great. Thanks. 
 JERRY: No, car wash, they don't you know? I'm not mannequins, they're going to his bag of a date with this thing. Somehow I feel like a little proposition for a little. 
 JERRY: Why would win one who has her all expiration dates. 
 JERRY: Well, one door [Jerry pushes the apartment she's kind of the world. Your breath is not believe this. 
 JERRY: How are you what are you on Marty nervously.) Well, if he's put anything about going by the head! 
 JERRY: Well, look. There's a party tonight. 
 JERRY: (dialing) He could've played just need that looks from his hand up a "Murphy Brown"! 
 JERRY: (watching George hasn't been driving it, you're right. You can't believe someone just wanted to do this. You sure? 
 JERRY: So we're here? 
 JERRY: I'm sorry I don't mind that my dry-cleaner? 
 JERRY: I wanna barrow Golden Boy's son Baby Junior. Yeah, I 

In [34]:
chain = train_n_markov_chain(jerry_lines.values,3)

In [35]:
print(generate_n_new_script(chain,3))

JERRY: Do you know what this is all about? Do you know, why we're here? To be out, this is out...and out is one of the amazing things to me...I have no cotton-balls, we're all human beings, what is the story? I've never had one...I never bought one, I never needed one, I've never been in a situation, when I thought to myself: ""I could use a cotton-ball right now."" I can certainly get out of this mess. Women need them and they don't need one or two, they need thousands of them, they need bags, they're like peat moss bags, have you ever seen the big trucks out on the street? Yeah, no problem. 
 JERRY: (upset) Ohhhh, what are you doing? Kessler, it's a tape! I taped the game, it's one o'clock in the morning! I avoided human contact all night to watch this. 
 JERRY: Meat? I don't, I don't know, go... hunt! (Kessler opens the refrigerator and sticks his head in.) Well what happened in the game anyway? 
 JERRY: (cynical) This is the one I'm- always has been one of the single most enjoyable

In [36]:
person_lines = [dat["jerry_lines"],dat["elaine_lines"],dat["george_lines"],dat["newman_lines"],dat["etc_lines"]]

In [37]:
each_line = pd.DataFrame(pd.concat(person_lines,ignore_index=True),columns=["lines"])

In [38]:
exploded_lines = explode(each_line,"lines").reset_index().drop("index",axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [85]:
full_lines = exploded_lines[exploded_lines["lines"] != ""]

In [86]:
full_lines["speaker"] = full_lines.loc[:,"lines"].str.extract("^(.*?)\:")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
def get_val_scores(k, c = "", score="f1"):
    model = KNeighborsClassifier(n_neighbors=k,metric='euclidean')
    pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])
    if score == "accuracy":
        return cross_val_score(pipeline, full_lines["lines"], y_train, 
                cv=10, scoring="accuracy").mean()
    else:
        return cross_val_score(pipeline, full_lines["lines"], y_train == c, 
                cv=10, scoring= score).mean()

In [None]:
"""vec = TfidfVectorizer(norm = None,max_features = 50)
scaler = Normalizer()

y_train = full_lines["speaker"]

k = pd.Series(range(1,15))
k.index = range(1,15)

    
val_f1 = k.apply(lambda x: get_val_scores(x,
                                          c = "Jerry",
                                          score="f1"))"""

In [None]:
#val_f1.plot.line()

In [None]:
"""get_val_scores(1,
                c = "Jerry",
                score="f1")"""

In [87]:
full_lines.loc[~(full_lines["speaker"].str.contains("JERRY") | full_lines["speaker"].str.contains("ELAINE") | 
    full_lines["speaker"].str.contains("KRAMER") | full_lines["speaker"].str.contains("GEORGE") | 
    full_lines["speaker"].str.contains("NEWMAN") | full_lines["speaker"].str.contains("JERRY") | 
    full_lines["speaker"].str.contains("FRANK")),["speaker"]] = "ETC"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [88]:
full_lines["speaker"] = (full_lines["speaker"].str.replace("'.*$","").str.replace("&.*$","").
 str.replace("\(.*$","").str.replace("\[.*$","").str.split("(/|\+|&|and|AND|,)",expand=True).
 apply(lambda x : list(x.dropna().astype(str)),axis = 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [89]:
full_lines = explode(full_lines,"speaker")

In [90]:
full_lines["speaker"] = full_lines["speaker"].str.strip()

In [99]:
full_lines = full_lines[~((full_lines["speaker"] == "/") | (full_lines["speaker"] == "+") |
(full_lines["speaker"] == "&") | (full_lines["speaker"] == "and" ) |
(full_lines["speaker"] == "AND") | (full_lines["speaker"] == ",") | (full_lines["speaker"] == ""))]


In [116]:
full_lines.loc[~((full_lines["speaker"] == "JERRY") | (full_lines["speaker"] =="ELAINE") | 
    (full_lines["speaker"] == "KRAMER") | (full_lines["speaker"] =="GEORGE") | 
    (full_lines["speaker"] =="NEWMAN") | (full_lines["speaker"] == "JERRY") | 
    (full_lines["speaker"] =="FRANK")),["speaker"]] = "ETC"

In [123]:
from sklearn.model_selection import cross_val_predict

In [127]:
y_train = full_lines["speaker"]

vec = TfidfVectorizer(norm = None)
scaler = Normalizer()
model = KNeighborsClassifier(n_neighbors=4,metric='euclidean')
pipeline = Pipeline([("vectorizer",vec),("scaler",scaler),("fit",model)])

predictions = cross_val_predict(pipeline, X =  full_lines["lines"], 
                                          y = y_train, 
                cv=10)

In [128]:
pd.Series(predictions)

0           ETC
1         JERRY
2           ETC
3           ETC
4        GEORGE
5           ETC
6           ETC
7           ETC
8           ETC
9           ETC
10          ETC
11          ETC
12        JERRY
13        JERRY
14        JERRY
15        JERRY
16        JERRY
17        JERRY
18          ETC
19        JERRY
20          ETC
21        JERRY
22          ETC
23        JERRY
24          ETC
25          ETC
26          ETC
27        JERRY
28        JERRY
29        JERRY
          ...  
60362     JERRY
60363       ETC
60364       ETC
60365       ETC
60366       ETC
60367       ETC
60368       ETC
60369       ETC
60370       ETC
60371       ETC
60372       ETC
60373       ETC
60374       ETC
60375       ETC
60376       ETC
60377       ETC
60378       ETC
60379       ETC
60380       ETC
60381     JERRY
60382       ETC
60383       ETC
60384       ETC
60385       ETC
60386       ETC
60387       ETC
60388       ETC
60389       ETC
60390     JERRY
60391       ETC
Length: 60392, dtype: ob