<a href="https://colab.research.google.com/github/cecann10/Test/blob/master/project_4_metis/potus_speeches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model to Understand Presidential Sentiment Toward Women
This worksheet is used for EDA, NLP, and modeling of the text of 991 presidential speeches that span all US Presidents from George Washington to Donald Trump (mid-term, 2019).

First, mount Google Drive to work with the files placed there for this project >

In [None]:
# Mount Google Drive at /content/drive.
# NOTE: the name `drive` is rebound to a PyDrive client in a later cell
# (`drive = GoogleDrive(gauth)`), so after that cell runs it no longer refers to this module.
from google.colab import drive
drive.mount('/content/drive')

Now install necessary systems to connect to Google Drive API and import custom modules for this project >

In [None]:
!pip install pydrive                             # Package to use Google Drive API - not installed in Colab VM by default
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [6]:
from google.colab import auth                    # Other necessary packages
from oauth2client.client import GoogleCredentials

In [7]:
auth.authenticate_user()                         # Follow prompt in the authorization process
# Build the PyDrive auth object and hand it the application-default credentials
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()

In [8]:
# NOTE: this rebinds `drive`, shadowing the `google.colab.drive` module imported in the mount cell
drive = GoogleDrive(gauth)
your_module = drive.CreateFile({"id": "1QT_L6vhXrPpBdY9xFA07YoAgvYR01cNu"})   # id= [part after "id=" in the shareable link of file]
your_module.GetContentFile("nlp_pipe.py")          # Save the .py module file to Colab VM
import nlp_pipe          # import module (must run after the file has been saved above)

Import all libraries and modules required to run this workbook >

In [9]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

# custom modules for this project's NLP
from nlp_pipe import nlp_pipe
from nlp_pipe import cleaning_function

-
### Get the data required for this project

First, pull in the csv file received from [Kaggle](https://www.kaggle.com/littleotter/united-states-presidential-speeches) that includes presidential speeches from every US President, starting with Washington on 1789-04-30 and ending with Trump on 2019-09-25.  Each row includes:
1. Date of speech
2. President
3. Party of President
4. Speech Title
5. Summary of Speech
6. Transcript
7. URL of source of transcript

In [15]:
# pull in full csv file of presidential speeches
# (`uploaded` is indexed by file name in the next cell to get the file contents)

from google.colab import files
uploaded = files.upload()

Saving presidential_speeches.csv to presidential_speeches.csv


In [16]:
# Now save presidential speeches csv as a dataframe

import io
# wrap the uploaded contents in an in-memory buffer so pandas can read them like a file
potus_speech = pd.read_csv(io.BytesIO(uploaded['presidential_speeches.csv']))


-
### Exploratory Data Analysis



In [17]:
# see how many speeches are within this file

len(potus_speech)

992

Clean file as needed >

In [18]:
# discovered one speech whose transcript is missing, so remove that row from the data
# (reassign rather than mutate with inplace=True, so the step is an ordinary expression)
# NOTE(review): the surviving rows keep their original index labels, so the index is no
# longer contiguous; frames rebuilt with a fresh 0..n-1 index will not align by label.

potus_speech = potus_speech.dropna(subset=['Transcript'])

In [19]:
# verify the row with the missing transcript was dropped (count should go from 992 to 991)

len(potus_speech)

991

In [None]:
potus_speech.columns

Index(['Date', 'President', 'Party', 'Speech Title', 'Summary', 'Transcript',
       'URL'],
      dtype='object')

Create a Series containing just the text of the presidential speeches as `transcripts` >

In [45]:
transcripts = potus_speech['Transcript']

In [25]:
# verify single column and includes all transcripts

transcripts.shape

(991,)

In [47]:
# save transcripts as csv
# NOTE(review): relative path — presumably resolves under the Drive mount at /content/drive
# because the working directory is /content; confirm before running elsewhere

transcripts.to_csv('drive/My Drive/Colab Notebooks/project_4_metis/csv/transcripts.csv')

-
### NLP

In [22]:
potus_nlp = nlp_pipe(cleaning_function)

In [23]:
potus_nlp.fit(transcripts)

In [24]:
transcripts_transformed = potus_nlp.transform(transcripts)

In [26]:
transcripts_transformed.shape

(991, 32122)

In [58]:
# take a look at various words related to women and see how many times used

transcripts_transformed['woman'].value_counts()

0     864
1      85
2      27
3       7
4       5
10      1
7       1
5       1
Name: woman, dtype: int64

# OLD PIPELINE TESTING - MOVED TO PY FILE & CAN DELETE ONCE CERTAIN DON'T NEED OTHER OPTIONS >>

-
### NLP

In [None]:
class nlp_pipe_v3:
    """Clean-then-vectorize text pipeline that returns the vectorizer's raw output.

    Parameters
    ----------
    cleaning_function : callable
        Called as ``cleaning_function(text, tokenizer, stemmer)``; its result is
        passed straight to the vectorizer.
    vectorizer : object, optional
        Object exposing ``fit`` and ``transform``. A new ``CountVectorizer()`` is
        created when omitted.
    tokenizer : callable, optional
        Forwarded to ``cleaning_function``. Defaults to
        ``TreebankWordTokenizer().tokenize``.
    stemmer : object, optional
        Forwarded to ``cleaning_function``. Defaults to a new ``PorterStemmer()``.
    """

    def __init__(self,
                 cleaning_function,
                 vectorizer=None,
                 tokenizer=None,
                 stemmer=None):
        # Build the defaults per instance. Default argument values are evaluated once,
        # when the function is defined, so `vectorizer=CountVectorizer()` in the
        # signature made every pipeline share (and re-fit) the same vectorizer.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = TreebankWordTokenizer().tokenize if tokenizer is None else tokenizer
        self.stemmer = PorterStemmer() if stemmer is None else stemmer
        self._is_fit = False

    def fit(self, text):
        """Clean ``text`` and fit the vectorizer on the cleaned documents."""
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True

    def transform(self, text):
        """Clean ``text`` and return ``self.vectorizer.transform`` of the result.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)

        return self.vectorizer.transform(clean_text)

In [None]:
#this is the version used in py file!

class nlp_pipe_v2:
    """Clean-then-vectorize text pipeline that returns a document-term DataFrame.

    Parameters
    ----------
    cleaning_function : callable
        Called as ``cleaning_function(text, tokenizer, stemmer)``; its result is
        passed straight to the vectorizer.
    vectorizer : object, optional
        Object exposing ``fit``, ``transform`` and a feature-name accessor. A new
        ``CountVectorizer()`` is created when omitted.
    tokenizer : callable, optional
        Forwarded to ``cleaning_function``. Defaults to
        ``TreebankWordTokenizer().tokenize``.
    stemmer : object, optional
        Forwarded to ``cleaning_function``. Defaults to a new ``PorterStemmer()``.
    """

    def __init__(self,
                 cleaning_function,
                 vectorizer=None,
                 tokenizer=None,
                 stemmer=None):
        # Build the defaults per instance. Default argument values are evaluated once,
        # when the function is defined, so `vectorizer=CountVectorizer()` in the
        # signature made every pipeline share (and re-fit) the same vectorizer.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = TreebankWordTokenizer().tokenize if tokenizer is None else tokenizer
        self.stemmer = PorterStemmer() if stemmer is None else stemmer
        self._is_fit = False

    def fit(self, text):
        """Clean ``text`` and fit the vectorizer on the cleaned documents."""
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True

    def transform(self, text):
        """Clean and vectorize ``text``, returning one row per document.

        Returns
        -------
        pandas.DataFrame
            Dense array from the vectorizer, with one column per feature name.
            The frame gets a fresh default index; the index of ``text`` is not kept.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        vectorized = self.vectorizer.transform(clean_text)

        # scikit-learn 1.2 removed get_feature_names(); prefer its replacement and
        # fall back to the old accessor so older installs keep working.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()

        return pd.DataFrame(vectorized.toarray(), columns=feature_names)

In [None]:
# Brendan version

def cleaning_function_v2(text, tokenizer, stemmer):
    """Tokenize and stem each document, returning one space-joined string per document.

    Parameters
    ----------
    text : iterable of str
        Documents to clean.
    tokenizer : callable
        Called on each document; must return an iterable of tokens.
    stemmer : object
        Must expose ``stem(token)``.

    Returns
    -------
    list of str
        Stemmed tokens of each document joined by single spaces, in input order.
    """
    def _stem_document(document):
        pieces = [stemmer.stem(piece) for piece in tokenizer(document)]
        return " ".join(pieces)

    return [_stem_document(document) for document in text]

In [None]:
# this is the version used in py file!
# Leon version with slight edit to stem no matter what

def cleaning_function_v3(text, tokenizer, stemmer):
    """Lower-case, stem and re-join the tokens of every document.

    Each document is split with ``tokenizer``; every token is lower-cased and then
    passed to ``stemmer.stem``. The stemmed tokens are joined with single spaces.

    Returns a list with one cleaned string per input document, in input order.
    """
    return [
        ' '.join([stemmer.stem(token.lower()) for token in tokenizer(document)])
        for document in text
    ]

In [None]:
# NOTE(review): this cell referenced the undefined name `cleaning_function_` (NameError).
# Assuming the in-notebook `cleaning_function_v3` was intended — confirm.
nlp = nlp_pipe(cleaning_function_v3)

In [None]:
nlp_v3 = nlp_pipe_v3(cleaning_function)

In [None]:
nlp.fit(transcripts)


In [None]:
# `nlp_v2` was never constructed in this notebook, so build it here before fitting
nlp_v2 = nlp_pipe_v2(cleaning_function)
nlp_v2.fit(transcripts)

In [None]:
transcripts_transformed = nlp_v2.transform(transcripts)

In [None]:
# the dangling `transcripts_transformed.` was a SyntaxError; check the result's dimensions instead
transcripts_transformed.shape