In [92]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

-
### Get the data required for this project

First, pull in the CSV file downloaded from [Kaggle](https://www.kaggle.com/littleotter/united-states-presidential-speeches), which contains presidential speeches from every US President, starting with Washington on 1789-04-30 and ending with Trump on 2019-09-25.  Each row includes:
1. Date of speech
2. President
3. Party of President
4. Speech Title
5. Summary of Speech
6. Transcript
7. URL of source of transcript

In [66]:
# Load the full presidential-speeches CSV (one row per speech) into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.

potus_speech = pd.read_csv('presidential_speeches.csv')

-
### Exploratory Data Analysis



In [67]:
# Row count of the raw file, i.e. how many speeches were loaded (one row per speech).

len(potus_speech)

992

In [74]:
# One speech has no transcript text, so keep only the rows whose 'Transcript'
# value is present. Rebinding the name (rather than mutating in place) keeps
# this cell safe to re-run.

potus_speech = potus_speech.dropna(subset=['Transcript'])

In [77]:
# Verify the row with the missing transcript was removed (a row, not a column):
# the count should be one lower than before the dropna.

len(potus_speech)

991

In [78]:
# List the column names available in the speeches DataFrame.
potus_speech.columns

Index(['Date', 'President', 'Party', 'Speech Title', 'Summary', 'Transcript',
       'URL'],
      dtype='object')

In [79]:
# Pull out just the 'Transcript' column; this is the text fed to the NLP pipeline below.
transcripts = potus_speech['Transcript']

In [87]:
# Sanity check: one transcript per remaining speech.
transcripts.shape

(991,)

In [86]:
# Tokenize every transcript, recording how many tokens each speech contains.
treebank = TreebankWordTokenizer()  # built once instead of once per speech
token_counts = []
for document in transcripts:
    tokens = treebank.tokenize(document)
    token_counts.append(len(tokens))

# NOTE: after the loop `tokens` holds only the LAST transcript, so the number
# printed here is that single speech's token count, not a corpus-wide total.
# Use `token_counts` for per-speech figures or `sum(token_counts)` for the total.
print(len(tokens))

7432


-
### NLP

In [151]:
class nlp_pipe:
    """Clean a collection of documents and vectorize the cleaned text.

    Both ``fit`` and ``transform`` first run
    ``cleaning_function(text, tokenizer, stemmer)`` and hand the result to the
    vectorizer.

    Parameters
    ----------
    cleaning_function : callable
        Called as ``cleaning_function(text, tokenizer, stemmer)``; its return
        value is passed unchanged to the vectorizer.
    vectorizer : object, optional
        Object exposing ``fit`` and ``transform``. When omitted, a new
        ``CountVectorizer()`` is created for this instance.
    tokenizer : callable, optional
        Forwarded to ``cleaning_function``. When omitted, the ``tokenize``
        method of a new ``TreebankWordTokenizer()`` is used.
    stemmer : object, optional
        Forwarded to ``cleaning_function``. When omitted, a new
        ``PorterStemmer()`` is created for this instance.
    """

    def __init__(self,
                 cleaning_function,
                 vectorizer=None,
                 tokenizer=None,
                 stemmer=None):
        # Defaults are built here rather than in the signature: a default
        # written as ``CountVectorizer()`` is evaluated once, when the class is
        # defined, and would then be shared (fitted vocabulary included) by
        # every instance that relies on it.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = (TreebankWordTokenizer().tokenize
                          if tokenizer is None else tokenizer)
        self.stemmer = PorterStemmer() if stemmer is None else stemmer
        self._is_fit = False

    def fit(self, text):
        """Clean ``text`` and fit the vectorizer on the cleaned documents."""
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True

    def transform(self, text):
        """Clean ``text`` and return the vectorizer's ``transform`` output.

        Raises
        ------
        ValueError
            If ``fit`` has not been called on this instance.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)

        return self.vectorizer.transform(clean_text)

In [168]:
class nlp_pipe_v2:
    """Clean a collection of documents and return the vectorized text as a DataFrame.

    Same pipeline as ``cleaning_function`` -> ``vectorizer``, but ``transform``
    returns a dense ``pandas.DataFrame`` whose columns are the vectorizer's
    feature names.

    Parameters
    ----------
    cleaning_function : callable
        Called as ``cleaning_function(text, tokenizer, stemmer)``; its return
        value is passed unchanged to the vectorizer.
    vectorizer : object, optional
        Object exposing ``fit``, ``transform`` and ``get_feature_names_out``
        (or the older ``get_feature_names``). When omitted, a new
        ``CountVectorizer()`` is created for this instance.
    tokenizer : callable, optional
        Forwarded to ``cleaning_function``. When omitted, the ``tokenize``
        method of a new ``TreebankWordTokenizer()`` is used.
    stemmer : object, optional
        Forwarded to ``cleaning_function``. When omitted, a new
        ``PorterStemmer()`` is created for this instance.
    """

    def __init__(self,
                 cleaning_function,
                 vectorizer=None,
                 tokenizer=None,
                 stemmer=None):
        # Defaults are built per instance: a default written in the signature
        # is evaluated once at class-definition time and would be shared
        # (fitted vocabulary included) across instances.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = (TreebankWordTokenizer().tokenize
                          if tokenizer is None else tokenizer)
        self.stemmer = PorterStemmer() if stemmer is None else stemmer
        self._is_fit = False

    def fit(self, text):
        """Clean ``text`` and fit the vectorizer on the cleaned documents."""
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True

    def transform(self, text, max_rows=5):
        """Clean and vectorize ``text``, returning a dense DataFrame.

        Parameters
        ----------
        text : iterable
            Documents passed to ``cleaning_function``.
        max_rows : int or None, default 5
            Number of leading rows to return. The default of 5 keeps the
            original preview behaviour; pass ``None`` to get every row.

        Raises
        ------
        ValueError
            If ``fit`` has not been called on this instance.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        vectorized = self.vectorizer.transform(clean_text)

        # Slice before densifying so a preview never materializes the full
        # documents-by-vocabulary dense array.
        if max_rows is not None:
            vectorized = vectorized[:max_rows]

        return pd.DataFrame(vectorized.toarray(),
                            columns=self._feature_names())

    def _feature_names(self):
        """Return the vectorizer's feature names across scikit-learn versions."""
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for older vectorizers.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.vectorizer.get_feature_names
        return get_names()

In [139]:
# Brendan version

def cleaning_function_v2(text, tokenizer, stemmer):
    """Stem every token of each speech and re-join the stems into one string.

    Parameters
    ----------
    text : iterable of str
        The speeches to clean.
    tokenizer : callable
        Splits one speech into an iterable of tokens.
    stemmer : object
        Provides ``stem(token)``.

    Returns
    -------
    list of str
        One space-joined string of stems per input speech, in input order.
        Token case is left as the stemmer returns it.
    """
    return [
        " ".join(map(stemmer.stem, tokenizer(speech)))
        for speech in text
    ]

In [152]:
# Leon version with slight edit to stem no matter what

def cleaning_function(text, tokenizer, stemmer):
    """Lower-case and stem every token of each document, then re-join them.

    Parameters
    ----------
    text : iterable of str
        The documents to clean.
    tokenizer : callable
        Splits one document into an iterable of string tokens.
    stemmer : object
        Provides ``stem(token)``; it receives the already lower-cased token.

    Returns
    -------
    list of str
        One space-joined string of stems per input document, in input order.
    """
    def normalize(document):
        # Lower-case first, then stem, for every token of this document.
        stems = [stemmer.stem(token.lower()) for token in tokenizer(document)]
        return ' '.join(stems)

    return list(map(normalize, text))

In [153]:
# Version-1 pipeline with the default vectorizer, tokenizer and stemmer;
# cleaning_function lower-cases and stems each token.
nlp = nlp_pipe(cleaning_function)

In [169]:
# Version-2 pipeline (transform returns a DataFrame), same cleaning function and defaults.
nlp_v2 = nlp_pipe_v2(cleaning_function)

In [158]:
# Clean every transcript and fit the version-1 vectorizer on the cleaned text.
nlp.fit(transcripts)


In [170]:
# Clean every transcript and fit the version-2 vectorizer on the cleaned text.
nlp_v2.fit(transcripts)

In [171]:
# Vectorize the transcripts into a DataFrame with one column per vocabulary term.
# NOTE: as called here, nlp_pipe_v2.transform returns only the first five rows,
# so this is a preview rather than the full document-term matrix.
transcripts_transformed = nlp_v2.transform(transcripts)

In [177]:
# Spot-check one vocabulary column: the count of the term 'the' in each previewed speech.
transcripts_transformed['the']

0    117
1     25
2     76
3    120
4    117
Name: the, dtype: int64

In [173]:
# Version-1 path: nlp.transform returns the vectorizer's raw output, which
# .toarray() converts to a dense NumPy array covering every transcript.

transcripts_vec = nlp.transform(transcripts).toarray()

# Shape is (number of transcripts, vocabulary size).
transcripts_vec.shape

(991, 32122)