In [21]:
#Import the relevand section of the NLP library
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize

import re

from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package punkt to /Users/Alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
seinfeld_directory = 'Seinfeld_Episodes/Season_5/'

seinfeld_season_5_episodes = ['S05_E01_The_Mango.txt', 'S05_E02_The_Puffy_Shirt.txt',
                              'S05_E03_The_Glasses.txt', 'S05_E04_The_Sniffing_Accountant.txt',
                              'S05_E05_The_Bris.txt', 'S05_E06_The_Lip_Reader.txt',
                              'S05_E07_The_Non_Fat_Yogurt.txt', 'S05_E08_The_Barber.txt',
                              'S05_E09_The_Masseuse.txt', 'S05_E10_The_Cigar_Store_Indian.txt',
                              'S05_E11_The_Conversion.txt', 'S05_E12_The_Stall.txt',
                              'S05_E13_The_Dinner_Party.txt', 'S05_E14_The_Marine_Biologist.txt',
                              'S05_E15_The_Pie.txt', 'S05_E16_The_Stand-In.txt',
                              'S05_E17_The_Wife.txt', 'S05_E18_The_Raincoats_Part_1.txt',
                              'S05_E19_The_Raincoats_Part_2.txt', 'S05_E20_The_Fire.txt',
                              'S05_E21_The_Hamptons.txt', 'S05_E22_The Opposite.txt']

In [3]:
def convert_file_names_to_dictionary(directory_name, list_of_file_names):

    episode_info = []
    
    show_name = " ".join(directory_name.split('/')[0].split('_')[:-1])
    
    season = int(" ".join(directory_name.split('/')[1].split('_')[1:]))
    
    for i in list_of_file_names:
        
        i_contents = i.split('_')
        
        episode = int(i_contents[1][1:])
        
        episode_name = " ".join(i_contents[2:])[:-4]
        
        episode_info.append({'show_name': show_name, 'season': season, 'episode': episode, 'episode_name': episode_name, 'file_path': directory_name + i})
        
    return episode_info

In [4]:
seinfeld_dict = convert_file_names_to_dictionary(seinfeld_directory, seinfeld_season_5_episodes)

In [5]:
df = pd.DataFrame(seinfeld_dict)
df.head()

Unnamed: 0,episode,episode_name,file_path,season,show_name
0,1,The Mango,Seinfeld_Episodes/Season_5/S05_E01_The_Mango.txt,5,Seinfeld
1,2,The Puffy Shirt,Seinfeld_Episodes/Season_5/S05_E02_The_Puffy_S...,5,Seinfeld
2,3,The Glasses,Seinfeld_Episodes/Season_5/S05_E03_The_Glasses...,5,Seinfeld
3,4,The Sniffing Accountant,Seinfeld_Episodes/Season_5/S05_E04_The_Sniffin...,5,Seinfeld
4,5,The Bris,Seinfeld_Episodes/Season_5/S05_E05_The_Bris.txt,5,Seinfeld


In [6]:
df.loc[0].file_path

'Seinfeld_Episodes/Season_5/S05_E01_The_Mango.txt'

In [7]:
def open_file(file_path):

    with open(file_path, 'r') as file:
        raw_text = file.read().replace('\n', ' ')
    
    return raw_text

In [8]:
open_file(df.loc[0].file_path)

"[location: nightclub] JERRY: A female orgasm is kinda like the bat cave. A very few people know where it is and if you're lucky enough to see it you probably don't know how you got there and you can't find you way back after you left. You know there are two types of female orgasm: the real and the fake. And I'll tell you right now, as a man, we don't know. We do not know, because to man sex is like a car accident and determining the female orgasm is like being asked 'What did you see after the car went out of control?'. 'I heard a lot of screeching sounds. I remember I was facing the wrong way at one point. And in the end my body was thrown clear. [location: Monk's] JERRY: So, what's her name? GEORGE: Karin. JERRY: Is she nice? GEORGE: Great. JERRY: So you like her? GEORGE: I think so. JERRY: You don't know? GEORGE: I can't tell anymore. JERRY: Well do you feel anything? GEORGE: Feel? What's that? JERRY: All right, let me ask you this: when she comes over, you're cleaning up a lot? GE

In [9]:
raw_text_episode_2 = open_file(df.loc[1].file_path)

In [10]:
def cleaned_episode(raw_text):
    
    raw_text_no_notes = re.sub("[\(\[].*?[\)\]]", "", raw_text)

    for symbol in "*,.?!''\n":
        raw_text_no_notes = raw_text_no_notes.replace(symbol, '').lower()
  
    cleaned_text = raw_text_no_notes.split(" ")    
    
    for i in cleaned_text:
        
        if i.endswith(':') == True or i == '' or i == ' ':
            cleaned_text.remove(i)
            
        i = i.replace('.', '')
        i = i.replace('?', '')
        i = i.replace('!', '')

     
    return cleaned_text

In [11]:
for i in df.file_path:
    episode_text = open_file(i)
    print(cleaned_episode(episode_text)[:20])

['a', 'female', 'orgasm', 'is', 'kinda', 'like', 'the', 'bat', 'cave', 'a', 'very', 'few', 'people', 'know', 'where', 'it', 'is', 'and', 'if', 'youre']
['i', 'cant', 'believe', 'this', 'oh', 'it', 'wont', 'be', 'for', 'that', 'long', 'how', 'can', 'i', 'do', 'this', 'how', 'can', 'i', 'move']
['i', 'never', 'get', 'enough', 'sleep', 'i', 'stay', 'up', 'late', 'at', 'night', 'cause', 'im', 'night', 'guy', 'night', 'guy', 'wants', 'to', 'stay']
['jerry', 'george', 'and', 'elaine', 'at', 'monks', 'caf', 'so', 'does', 'he', 'like', 'you', 'what', 'do', 'you', 'think', 'you', 'like', 'him', 'yeah']
['act', 'one', 'scene', 'a', 'int', 'hospital', 'room', '-', 'day', 'jerry', 'elaine', 'and', 'george', 'are', 'visiting', 'with', 'new', 'parents', 'myra', 'and']
['at', 'the', 'comedy', 'club', 'professional', 'tennis', 'to', 'me', 'i', 'dont', 'understand', 'all', 'the', 'shushing', 'why', 'are', 'they', 'always', 'shushing', 'shh']
['jerry', 'ive', 'always', 'been', 'a', 'big', 'fan', 'of', '

# Part Two: tokenize/stemming/lemmatization

## Tokenize

In [12]:
episode_1_text = open_file(df.loc[0].file_path)
episode_1_text[:500]

"[location: nightclub] JERRY: A female orgasm is kinda like the bat cave. A very few people know where it is and if you're lucky enough to see it you probably don't know how you got there and you can't find you way back after you left. You know there are two types of female orgasm: the real and the fake. And I'll tell you right now, as a man, we don't know. We do not know, because to man sex is like a car accident and determining the female orgasm is like being asked 'What did you see after the c"

In [13]:
cleaned_episode_1_text = cleaned_episode(episode_1_text)
" ".join(cleaned_episode_1_text[:500])

'a female orgasm is kinda like the bat cave a very few people know where it is and if youre lucky enough to see it you probably dont know how you got there and you cant find you way back after you left you know there are two types of female the real and the fake and ill tell you right now as a man we dont know we do not know because to man sex is like a car accident and determining the female orgasm is like being asked what did you see after the car went out of control i heard a lot of screeching sounds i remember i was facing the wrong way at one point and in the end my body was thrown clear so whats her name karin is she nice great so you like her i think so you dont know i cant tell anymore well do you feel anything feel whats that all right let me ask you when she comes over youre cleaning up a lot yeah youre just straightening up or youre cleaning cleaning you do the tub yeah on your knees ajax scrubbing the whole deal yeah okay i think youre in love tub is love tub is love so the

In [14]:
def tokenize(clean_text):
    joined_episode = ' '.join(clean_text)
    tokenized_episode = word_tokenize(joined_episode)
    
    return tokenized_episode

In [15]:
tokenize_episode_1_text = tokenize(cleaned_episode_1_text)
" ".join(tokenize_episode_1_text[:500])

'a female orgasm is kinda like the bat cave a very few people know where it is and if youre lucky enough to see it you probably dont know how you got there and you cant find you way back after you left you know there are two types of female the real and the fake and ill tell you right now as a man we dont know we do not know because to man sex is like a car accident and determining the female orgasm is like being asked what did you see after the car went out of control i heard a lot of screeching sounds i remember i was facing the wrong way at one point and in the end my body was thrown clear so whats her name karin is she nice great so you like her i think so you dont know i cant tell anymore well do you feel anything feel whats that all right let me ask you when she comes over youre cleaning up a lot yeah youre just straightening up or youre cleaning cleaning you do the tub yeah on your knees ajax scrubbing the whole deal yeah okay i think youre in love tub is love tub is love so the

## Lemmetization

In [16]:
#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

import nltk
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

In [17]:
tokenize_episode_1_text

['a',
 'female',
 'orgasm',
 'is',
 'kinda',
 'like',
 'the',
 'bat',
 'cave',
 'a',
 'very',
 'few',
 'people',
 'know',
 'where',
 'it',
 'is',
 'and',
 'if',
 'youre',
 'lucky',
 'enough',
 'to',
 'see',
 'it',
 'you',
 'probably',
 'dont',
 'know',
 'how',
 'you',
 'got',
 'there',
 'and',
 'you',
 'cant',
 'find',
 'you',
 'way',
 'back',
 'after',
 'you',
 'left',
 'you',
 'know',
 'there',
 'are',
 'two',
 'types',
 'of',
 'female',
 'the',
 'real',
 'and',
 'the',
 'fake',
 'and',
 'ill',
 'tell',
 'you',
 'right',
 'now',
 'as',
 'a',
 'man',
 'we',
 'dont',
 'know',
 'we',
 'do',
 'not',
 'know',
 'because',
 'to',
 'man',
 'sex',
 'is',
 'like',
 'a',
 'car',
 'accident',
 'and',
 'determining',
 'the',
 'female',
 'orgasm',
 'is',
 'like',
 'being',
 'asked',
 'what',
 'did',
 'you',
 'see',
 'after',
 'the',
 'car',
 'went',
 'out',
 'of',
 'control',
 'i',
 'heard',
 'a',
 'lot',
 'of',
 'screeching',
 'sounds',
 'i',
 'remember',
 'i',
 'was',
 'facing',
 'the',
 'wrong'

In [18]:
def lemmatize_text(tokenized_text):
    return ' '.join([lemmatizer.lemmatize(w) for w in tokenized_text])

# One Function To Process Text

In [20]:
def process_text(file_name):
    raw_episode_text = open_file(file_name)
    clean_episode_text = cleaned_episode(raw_episode_text)
    tokenize_episode_text = tokenize(clean_episode_text)
    lemmatize_episode_text = lemmatize_text(tokenize_episode_text)
    
    return lemmatize_episode_text

In [30]:
df['tokenized_text'] = df.apply(lambda x: process_text(x))
df.head()

TypeError: ('expected str, bytes or os.PathLike object, not Series', 'occurred at index episode')

In [19]:
lemmatize_text(tokenize_episode_1_text)

'a female orgasm is kinda like the bat cave a very few people know where it is and if youre lucky enough to see it you probably dont know how you got there and you cant find you way back after you left you know there are two type of female the real and the fake and ill tell you right now a a man we dont know we do not know because to man sex is like a car accident and determining the female orgasm is like being asked what did you see after the car went out of control i heard a lot of screeching sound i remember i wa facing the wrong way at one point and in the end my body wa thrown clear so whats her name karin is she nice great so you like her i think so you dont know i cant tell anymore well do you feel anything feel whats that all right let me ask you when she come over youre cleaning up a lot yeah youre just straightening up or youre cleaning cleaning you do the tub yeah on your knee ajax scrubbing the whole deal yeah okay i think youre in love tub is love tub is love so there you 

# Part Three: Vectorizing the Text

In [22]:
#You Must pass CountVectorizer a series, not a dataframe
count_vectorizer = CountVectorizer()
