# Natural Language Processing (NLP) Process For Seinfeld Transcripts

This notebook will outline the process of cleaning, tokenizing, and vectorizing text transcripts of Seinfeld Season 5 Episodes. Source of transcripts: https://www.seinfeldscripts.com/seinfeld-scripts.html

In [56]:
#Import the NLTK library, tokenizer, and methods
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

#Import visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

import re

[nltk_data] Downloading package punkt to /Users/Alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read The Files

In [57]:
# Folder (relative path) that holds the Season 5 transcript text files.
seinfeld_directory = 'Seinfeld_Episodes/Season_5/'

# Transcript file names in episode order; list index 0 is episode 1.
# NOTE(review): the episode 22 entry contains a space ("The Opposite") where
# every other name uses underscores -- confirm it matches the file on disk.
seinfeld_season_5_episodes = [
    'S05_E01_The_Mango.txt',
    'S05_E02_The_Puffy_Shirt.txt',
    'S05_E03_The_Glasses.txt',
    'S05_E04_The_Sniffing_Accountant.txt',
    'S05_E05_The_Bris.txt',
    'S05_E06_The_Lip_Reader.txt',
    'S05_E07_The_Non_Fat_Yogurt.txt',
    'S05_E08_The_Barber.txt',
    'S05_E09_The_Masseuse.txt',
    'S05_E10_The_Cigar_Store_Indian.txt',
    'S05_E11_The_Conversion.txt',
    'S05_E12_The_Stall.txt',
    'S05_E13_The_Dinner_Party.txt',
    'S05_E14_The_Marine_Biologist.txt',
    'S05_E15_The_Pie.txt',
    'S05_E16_The_Stand-In.txt',
    'S05_E17_The_Wife.txt',
    'S05_E18_The_Raincoats_Part_1.txt',
    'S05_E19_The_Raincoats_Part_2.txt',
    'S05_E20_The_Fire.txt',
    'S05_E21_The_Hamptons.txt',
    'S05_E22_The Opposite.txt',
]


In [58]:
# Load the episode 2 transcript (list index 1) and flatten it onto one line:
# every newline becomes a single space.
with open(seinfeld_directory + seinfeld_season_5_episodes[1], 'r') as file:
    raw_text_episode_2 = ' '.join(file.read().split('\n'))

In [59]:
# Preview the first 1050 characters of the flattened episode 2 transcript.
raw_text_episode_2[:1050]

"[Setting: Jerry's apartment] (Jerry and George are waiting for Kramer, so he can help them move George's stuff back into his parent's house) GEORGE: I can't believe this! JERRY: Oh, it won't be for that long. GEORGE: How can I do this?! How can I move back in with those people? Please, tell me! They're insane! You know that. JERRY: Hey, my parents are just as crazy as your parents. GEORGE: How can you compare you parents to my parents?! JERRY: My father has never thrown anything out. Ever! GEORGE: My father wears his sneakers in the pool! Sneakers! JERRY: My mother has never set foot in a natural body of water. GEORGE: (Showing Jerry up) Listen carefully. My mother has never laughed. Ever. Not a giggle, not a chuckle, not a tee-hee.. never went 'Ha!' JERRY: A smirk? GEORGE: Maybe!.. And I'm moving back in there! JERRY: I told you I'd lend you the money for the rent. GEORGE: No, no, no, no. Borrowing money from a friend is like having sex. It just completely changes the relationship. (

## Cleaning The Text Data

### Step 1: Manual

In [62]:
# Remove every "(...)" or "[...]" span (shortest match) from the transcript.
# The pattern is a raw string so that "\(" and "\[" reach the regex engine as
# written instead of being parsed as (invalid) Python string escapes.
raw_text_quotes = re.sub(r"[\(\[].*?[\)\]]", "", raw_text_episode_2)
raw_text_quotes

'  GEORGE: I can\'t believe this! JERRY: Oh, it won\'t be for that long. GEORGE: How can I do this?! How can I move back in with those people? Please, tell me! They\'re insane! You know that. JERRY: Hey, my parents are just as crazy as your parents. GEORGE: How can you compare you parents to my parents?! JERRY: My father has never thrown anything out. Ever! GEORGE: My father wears his sneakers in the pool! Sneakers! JERRY: My mother has never set foot in a natural body of water. GEORGE:  Listen carefully. My mother has never laughed. Ever. Not a giggle, not a chuckle, not a tee-hee.. never went \'Ha!\' JERRY: A smirk? GEORGE: Maybe!.. And I\'m moving back in there! JERRY: I told you I\'d lend you the money for the rent. GEORGE: No, no, no, no. Borrowing money from a friend is like having sex. It just completely changes the relationship.  KRAMER: Alright. I\'m ready.  You know, I still don\'t understand - why do you want to move back in with your parents? GEORGE: I don\'t want to! I\'m 

In [65]:
def clean_text(raw_text):
    """Turn a raw transcript string into a list of lowercase words.

    Processing order:
      1. Delete every ``(...)`` or ``[...]`` span (shortest match).
      2. Split what is left on single space characters.
      3. Discard any chunk that contains a colon (e.g. ``GEORGE:``).
      4. Delete the characters ``. , ? ! '`` from each chunk and lowercase it.
      5. Discard chunks that end up empty.

    Parameters
    ----------
    raw_text : str
        Transcript text. The regex ``.`` does not match newlines and the
        split is on ``" "`` only, so newlines should already have been
        replaced by spaces.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Raw string: the backslashes are regex escapes, not Python string escapes.
    without_bracketed = re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

    # One translation table removes all five punctuation marks in a single pass.
    drop_punctuation = str.maketrans("", "", ".,?!'")

    cleaned_text = []
    for word in without_bracketed.split(" "):
        if ":" in word:
            continue
        word = word.translate(drop_punctuation).lower()
        # Skip chunks that were blank or consisted only of punctuation.
        if word:
            cleaned_text.append(word)

    return cleaned_text

In [66]:
# Run the cleaner on the episode 2 transcript and display the resulting word list.
clean_text(raw_text_episode_2)

['i',
 'cant',
 'believe',
 'this',
 'oh',
 'it',
 'wont',
 'be',
 'for',
 'that',
 'long',
 'how',
 'can',
 'i',
 'do',
 'this',
 'how',
 'can',
 'i',
 'move',
 'back',
 'in',
 'with',
 'those',
 'people',
 'please',
 'tell',
 'me',
 'theyre',
 'insane',
 'you',
 'know',
 'that',
 'hey',
 'my',
 'parents',
 'are',
 'just',
 'as',
 'crazy',
 'as',
 'your',
 'parents',
 'how',
 'can',
 'you',
 'compare',
 'you',
 'parents',
 'to',
 'my',
 'parents',
 'my',
 'father',
 'has',
 'never',
 'thrown',
 'anything',
 'out',
 'ever',
 'my',
 'father',
 'wears',
 'his',
 'sneakers',
 'in',
 'the',
 'pool',
 'sneakers',
 'my',
 'mother',
 'has',
 'never',
 'set',
 'foot',
 'in',
 'a',
 'natural',
 'body',
 'of',
 'water',
 'listen',
 'carefully',
 'my',
 'mother',
 'has',
 'never',
 'laughed',
 'ever',
 'not',
 'a',
 'giggle',
 'not',
 'a',
 'chuckle',
 'not',
 'a',
 'tee-hee',
 'never',
 'went',
 'ha',
 'a',
 'smirk',
 'maybe',
 'and',
 'im',
 'moving',
 'back',
 'in',
 'there',
 'i',
 'told',
 '

In [51]:
import re

def cleaned_text(raw_text):
    """Return ``raw_text`` with every ``(...)`` or ``[...]`` span removed.

    Matching is non-greedy, so each opening bracket is paired with the
    nearest closing one. The regex ``.`` does not match newlines, so a span
    that crosses a line break is left in place. Surrounding whitespace is
    not collapsed.

    Parameters
    ----------
    raw_text : str
        Text to strip.

    Returns
    -------
    str
        The text without the bracketed/parenthesised spans.
    """
    # Raw string: the backslashes are regex escapes, not Python string escapes.
    return re.sub(r"[\(\[].*?[\)\]]", "", raw_text)

In [52]:
# Strip the bracketed/parenthesised spans from episode 2 with cleaned_text()
# (the string-returning helper, not clean_text()) and display the result.
# NOTE(review): a later cell rebinds cleaned_text_episode_2 to the word list
# returned by clean_text(), so this name holds a str here and a list afterwards.
cleaned_text_episode_2 = cleaned_text(raw_text_episode_2)
cleaned_text_episode_2

'  GEORGE: I can\'t believe this! JERRY: Oh, it won\'t be for that long. GEORGE: How can I do this?! How can I move back in with those people? Please, tell me! They\'re insane! You know that. JERRY: Hey, my parents are just as crazy as your parents. GEORGE: How can you compare you parents to my parents?! JERRY: My father has never thrown anything out. Ever! GEORGE: My father wears his sneakers in the pool! Sneakers! JERRY: My mother has never set foot in a natural body of water. GEORGE:  Listen carefully. My mother has never laughed. Ever. Not a giggle, not a chuckle, not a tee-hee.. never went \'Ha!\' JERRY: A smirk? GEORGE: Maybe!.. And I\'m moving back in there! JERRY: I told you I\'d lend you the money for the rent. GEORGE: No, no, no, no. Borrowing money from a friend is like having sex. It just completely changes the relationship.  KRAMER: Alright. I\'m ready.  You know, I still don\'t understand - why do you want to move back in with your parents? GEORGE: I don\'t want to! I\'m 

In [54]:
# Rebind cleaned_text_episode_2 to the word list produced by clean_text().
cleaned_text_episode_2 = clean_text(raw_text_episode_2)

In [55]:
# Display the current value of cleaned_text_episode_2.
cleaned_text_episode_2

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'i',
 'i',
 'i',
 'i',
 'i',
 "can't",
 "can't",
 "can't",
 "can't",
 'cant',
 'believe',
 'believe',
 'believe',
 'believe',
 'believe',
 'this!',
 'this!',
 'this!',
 'this',
 'this',
 'oh,',
 'oh',
 'oh',
 'oh',
 'oh',
 'it',
 'it',
 'it',
 'it',
 'it',
 "won't",
 "won't",
 "won't",
 "won't",
 'wont',
 'be',
 'be',
 'be',
 'be',
 'be',
 'for',
 'for',
 'for',
 'for',
 'for',
 'that',
 'that',
 'that',
 'that',
 'that',
 'long',
 'long',
 'long',
 'long',
 'long',
 'how',
 'how',
 'how',
 'how',
 'how',
 'can',
 'can',
 'can',
 'can',
 'can',
 'i',
 'i',
 'i',
 'i',
 'i',
 'do',
 'do',
 'do',
 'do',
 'do',
 'this?!',
 'this?!',
 'this!',
 'this',
 'this',
 'how',
 'how',
 'how',
 'how',
 'how',
 'can',
 'can',
 'can',
 'can',
 'can',
 'i',
 'i',
 'i',
 'i',
 'i',
 'move',
 'move',
 'move',
 'move',
 'move',
 'back',
 'back',
 'back',
 'back',
 'back',
 'in',
 'in',
 'in',
 'in',
 'in',
 'with',
 'with',
 'with',
 'with',
 'with',
 't

## Tokenize The Sentence

In [8]:
def tokenize(cleaned_text):
    """Join the given words with single spaces and run ``word_tokenize`` on the result.

    Parameters
    ----------
    cleaned_text : iterable of str
        Words to be joined into one space-separated string.

    Returns
    -------
    The value returned by ``word_tokenize`` for the joined string.
    """
    return word_tokenize(' '.join(cleaned_text))


In [9]:
# Tokenize the cleaned episode 2 words and preview the first ten tokens.
tokenized_text_episode_2 = tokenize(cleaned_text_episode_2)
tokenized_text_episode_2[:10]

['jerrys',
 'and',
 'george',
 'are',
 'waiting',
 'for',
 'kramer',
 'so',
 'he',
 'can']

# Vectorization

## Method 1: Count Vectorization

In [None]:
def count_vectorize(episode, vocab=None):
    """Count how often each vocabulary word occurs in ``episode``.

    Parameters
    ----------
    episode : sequence of hashable
        The words of one document. It is iterated twice when ``vocab`` is
        not supplied, so pass a list or tuple rather than a one-shot iterator.
    vocab : iterable of hashable, optional
        Fixed vocabulary to count against. When omitted (or empty), the
        vocabulary is the set of distinct words in ``episode``.

    Returns
    -------
    dict
        Maps every vocabulary word to its number of occurrences in
        ``episode``. Vocabulary words that never occur map to 0.
    """
    if vocab:
        unique_words = vocab
    else:
        unique_words = set(episode)

    episode_dict = {word: 0 for word in unique_words}

    for word in episode:
        # With a caller-supplied vocabulary the episode may contain words that
        # are not in it; ignore those instead of raising KeyError.
        if word in episode_dict:
            episode_dict[word] += 1

    return episode_dict

# Smoke-test count_vectorize on the episode 2 word list (the only cleaned
# episode defined above) and preview ten of the resulting word counts.
test_vectorized = count_vectorize(cleaned_text_episode_2)
dict(list(test_vectorized.items())[:10])

In [None]:
def tf_idf(list_of_dicts):
    """Compute a TF-IDF dictionary for every document in the corpus.

    Parameters
    ----------
    list_of_dicts : list of dict
        One dict per document, keyed by word. Each dict is passed to
        ``inverse_document_frequency`` (as part of the list) and to
        ``term_frequency``.

    Returns
    -------
    list of dict
        One new dict per input document, in the same order. Every dict has a
        key for each word in the corpus vocabulary; words for which
        ``term_frequency`` returns no entry keep the value 0.

    Notes
    -----
    ``term_frequency`` is defined elsewhere. It is used here as a mapping
    from word to a numeric term frequency -- presumably normalised counts;
    verify against its definition.
    """
    idf = inverse_document_frequency(list_of_dicts)

    tf_idf_list_of_dicts = []

    for doc in list_of_dicts:
        # Build a fresh, zero-filled dict for each document. Reusing a single
        # dict would make every list entry alias the same object and carry
        # values over from previously processed documents.
        doc_tf_idf = dict.fromkeys(idf, 0)
        doc_tf = term_frequency(doc)
        for word in doc_tf:
            doc_tf_idf[word] = doc_tf[word] * idf[word]
        tf_idf_list_of_dicts.append(doc_tf_idf)

    return tf_idf_list_of_dicts

In [None]:
def inverse_document_frequency(list_of_dicts):
    """Compute the inverse document frequency of every word in the corpus.

    For each word that appears as a key in at least one document dict:

        idf(word) = ln(number of documents / number of documents containing word)

    Parameters
    ----------
    list_of_dicts : list of dict
        One dict per document; only the keys (words) are used.

    Returns
    -------
    dict
        Maps each word to its IDF value as computed by ``np.log``.
    """
    total_docs = len(list_of_dicts)

    # Gather the corpus vocabulary: the union of all document keys.
    vocabulary = set()
    for document in list_of_dicts:
        vocabulary.update(document.keys())

    idf_by_word = {}
    for term in vocabulary:
        # Document frequency: how many documents have this word as a key.
        containing = sum(1 for document in list_of_dicts if term in document)
        idf_by_word[term] = np.log(total_docs / float(containing))

    return idf_by_word