In [14]:
import pandas as pd
import re

In [2]:
def open_file(file_path):

    with open(file_path, 'r') as file:
        raw_text = file.read().replace('\n', ' ')
    
    return raw_text

In [3]:
from nltk.corpus import stopwords

stopwords_list = stopwords.words('english')

# It is generally a good idea to also remove punctuation
import string

# Now we have a list that includes all english stopwords, as well as all punctuation
stopwords_list += list(string.punctuation)

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
rolling_stones_article = open_file("sample_text/December's Children (And Everybody's).txt")

In [6]:
from nltk import word_tokenize

tokens = word_tokenize(rolling_stones_article)

# It is usually a good idea to lowercase all tokens during this step, as well
stopped_tokens = [w.lower() for w in tokens if w not in stopwords_list]

In [7]:
stopped_tokens

['december',
 "'s",
 'children',
 'and',
 'everybody',
 "'s",
 'december',
 "'s",
 'children',
 'and',
 'everybody',
 "'s",
 'fifth',
 'american',
 'studio',
 'album',
 'english',
 'rock',
 'band',
 'rolling',
 'stones',
 'released',
 'december',
 '1965',
 'although',
 'largely',
 'draws',
 'songs',
 'issued',
 'earlier',
 'year',
 'united',
 'kingdom',
 'album',
 'includes',
 'three',
 'previously',
 'unreleased',
 'tunes',
 'it',
 'last',
 'group',
 "'s",
 'early',
 'albums',
 'feature',
 'numerous',
 'cover',
 'songs',
 'writers',
 'mick',
 'jagger',
 'keith',
 'richards',
 'wrote',
 'half',
 'songs',
 'there',
 'sessions',
 'record',
 'album',
 'many',
 'songs',
 'drawn',
 'sessions',
 'uk',
 'edition',
 'out',
 'our',
 'heads',
 'september',
 '1965',
 'los',
 'angeles',
 'many',
 'tracks',
 'appeared',
 'earlier',
 'uk',
 'versions',
 'rolling',
 'stones',
 'albums',
 'left',
 'american',
 'counterparts',
 'other',
 'tracks',
 'unreleased',
 'tracks',
 'recorded',
 'recording',
 '

# Updated cleaning function with NLTK

In [9]:
seinfeld_pilot = 'Seinfeld_Episodes/Season_1/S01_E01_The_Seinfeld_Chronicles.txt'

In [11]:
seinfeld_pilot_raw_text = open_file(seinfeld_pilot)

In [12]:
def old_cleaned_episode(raw_text, stop_words = False):
    
    raw_text_no_notes = re.sub("[\(\[].*?[\)\]]", "", raw_text)

    for symbol in "*,#-.?!''\n":
        raw_text_no_notes = raw_text_no_notes.replace(symbol, '').lower()
  
    cleaned_text = raw_text_no_notes.split(" ")    
    
    for i in cleaned_text:
        
        if i.endswith(':') == True or i == '' or i == ' ':
            cleaned_text.remove(i)
            
        i = i.replace('.', '')
        i = i.replace('?', '')
        i = i.replace('!', '')
        
    cleaned_text  = [word for word in cleaned_text if word.endswith(':') == False]
    
    if stop_words:
        
        cleaned_text  = [word for word in cleaned_text if word.lower() not in stop_words]
     
    return cleaned_text

In [17]:
old_cleaned_episode(seinfeld_pilot_raw_text, stop_words = False)[:10]

['so', 'im', 'on', 'line', 'at', 'the', 'supermarket', 'two', 'women', 'in']

In [25]:
# Import NLTK's stopwords list
from nltk.corpus import stopwords

# Import NLTK's punctuation list
# '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
import string

# Import NLTK's word tokenizer
from nltk import word_tokenize

# Add only English language stopwords and punctuation list together
nltk_stopwords_list = stopwords.words('english')
nltk_stopwords_list += list(string.punctuation)

In [31]:
def cleaned_episode(raw_text, custom_stop_words = False):
    
    # Copy the NLTK list in case of customization
    stop_words_list = nltk_stopwords_list
    
    # Removes all text between and including brackets and parenthesis with RegEx
    raw_text_no_stage_notes = re.sub("[\(\[].*?[\)\]]", "", raw_text)

    # Remove all text with colons (:), i.e. character line indications
    raw_text_no_stage_notes_or_names = raw_text_no_stage_notes.split(" ")
    
    for i in raw_text_no_stage_notes_or_names:
        
        if i.endswith(':') == True or i == '' or i == ' ':
            raw_text_no_stage_notes_or_names.remove(i)
            
    # Rejoin all of the text as one string for tokenization
    raw_text_rejoined = " ".join(raw_text_no_stage_notes_or_names)
    
    
    # If a list of additional custom stopwords are passed add them to the default
    # NLTK stopwords and punctuation list
    if custom_stop_words:
        
        stop_words_list += custom_stop_words
    
    # Tokenize the raw text
    token_list = word_tokenize(raw_text_rejoined)
    
    # Remove stop words and punctuation
    cleaned_and_tokenized_list = [w.lower() for w in token_list if w not in stop_words_list]

    return cleaned_and_tokenized_list

In [32]:
cleaned_episode(seinfeld_pilot_raw_text, custom_stop_words = False)[:10]

['so',
 'i',
 "'m",
 'line',
 'supermarket',
 'two',
 'women',
 'front',
 'one',
 'total']