In [1]:
#importing necessary libraries
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bs4
from datetime import datetime
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the input spreadsheets used by the rest of the notebook.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs on a machine with this exact directory layout.

# Approval polls; 'enddate' is parsed as a date so it can be grouped and formatted later.
approval = pd.read_csv('C:/Users/brian/Documents/Python/approval_polllist.csv', parse_dates=['enddate'])
# Lexicon passed to sentiment_variable() for the emotion counts ('word' / 'sentiment' columns).
sentiment_corpus = pd.read_csv('C:/Users/brian/Documents/Python/nrc.csv')
# Lexicon passed to sentiment_variable() for the 'positive' / 'negative' counts.
pos_neg_corpus = pd.read_csv('C:/Users/brian/Documents/Python/bing.csv')
# Lexicon passed to policy_variable() ('word' / 'policy' columns); first row is the header.
policy_corpus = pd.read_excel('C:/Users/brian/Documents/Policy Corpus.xlsx', header= 0)

In [3]:
# Yearly index pages; each one lists the links to the individual speech pages.
_YEAR_INDEX_TEMPLATE = 'http://www.presidency.ucsb.edu/ws/index.php?month=&year={year}'
links_from_2017 = _YEAR_INDEX_TEMPLATE.format(year=2017)
links_from_2018 = _YEAR_INDEX_TEMPLATE.format(year=2018)

In [4]:
# Accumulates the URL of every speech found by get_speeches().
links = []


def get_speeches(url, timeout=30):
    """Collect the URLs of the Trump documents listed on an index page.

    Downloads ``url``, reads the record count the page reports, and walks at
    most that many rows of the results table. Every row whose markup mentions
    'Trump' and that carries a link contributes one absolute URL, which is
    appended to the module-level ``links`` list.

    Parameters
    ----------
    url : str
        Address of the index page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The URLs added by this call (they are also appended to ``links``).

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    ValueError
        If no record count can be read from the page.
    """
    import re

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the page and keep only its last top-level node.
    # NOTE(review): presumably this node wraps the results table - confirm
    # against the live page markup.
    html = bs4.BeautifulSoup(response.content, 'html.parser').contents[-1]

    # NOTE(review): the fifth <span> is assumed to hold the record count.
    records = html.find_all('span')[4]
    # Take the last number in the text, whatever its width; slicing a fixed
    # three characters breaks on counts of four or more digits and on
    # trailing whitespace.
    numbers = re.findall(r'\d[\d,]*', records.text)
    if not numbers:
        raise ValueError('could not read the record count from ' + url)
    record_count = int(numbers[-1].replace(',', ''))

    # Drop the leading and trailing rows that only carry page formatting.
    rows = html.find_all('tr')[3:-2]

    found = []
    # Slicing (rather than indexing by the reported count) cannot run past the
    # rows that are actually present.
    for row in rows[:record_count]:
        # Keep only documents attributed to Trump.
        if 'Trump' not in str(row.contents):
            continue
        anchor = row.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        # The href is relative; attach it to the site's base address.
        found.append('http://www.presidency.ucsb.edu/ws/' + str(anchor['href']))

    links.extend(found)
    return found


In [5]:
# Gather the speech URLs one index page at a time: 2017 first, then 2018.
for index_url in (links_from_2017, links_from_2018):
    get_speeches(index_url)

In [6]:

# Seconds to wait for each speech page before giving up instead of hanging.
PAGE_TIMEOUT = 30

texts = []
dates = []
titles = []
# Download every speech page and record its title, text and date.
# The three lists are parallel: position k in each describes the same speech.
for link in links:
    response = requests.get(link, timeout=PAGE_TIMEOUT)
    response.raise_for_status()
    parsed = bs4.BeautifulSoup(response.content, 'html.parser')

    # Title: the <title> markup with a fixed-width prefix and suffix cut off.
    title = str(parsed.title)[24:-8]

    # Text: the 'displaytext' elements followed by every <p> on the page.
    text = [node.text for node in parsed.find_all(class_='displaytext')]
    text1 = [node.text for node in parsed.find_all('p')]
    words = ' '.join(text + text1)

    # Date: first 'docdate' element, e.g. 'January 20, 2017' -> '01/20/2017'.
    date_texts = [node.text for node in parsed.find_all(class_='docdate')]
    if not date_texts:
        raise ValueError('no docdate element found on ' + link)
    date = datetime.strptime(date_texts[0], '%B %d, %Y')
    date1 = date.strftime('%m/%d/%Y')

    # Append only once everything has parsed, so a failure part-way through a
    # page can never leave the three lists with different lengths.
    titles.append(title)
    texts.append(words)
    dates.append(date1)


In [34]:
# Stop-word list: NLTK's English stop words plus a few words of our own.
stop_words = stopwords.words('english')
additional = ['us', 'president', 'trump', 'donald', 'vice']
stop_words.extend(additional)

In [35]:
# Normalise every speech: lower-case it, turn punctuation into spaces, then
# split it into word tokens.
# clean_speeches holds one cleaned string per speech; tokens holds one list of
# single-word tokens per speech.

# Each of these characters is replaced by a single space.
_PUNCTUATION = (
    ',', '.', '?', '\\', "'", ':', ';', '!', '-', '—',
    '"', '*', '(', ')', '&', '#', '%', '$', '[', ']',
)
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(_PUNCTUATION, ' '))


def _clean_text(text):
    """Return ``text`` lower-cased with the listed punctuation replaced by spaces."""
    # Lower-casing matters: the custom stop words are lower-case, so a
    # capitalised token such as 'Trump' would otherwise never match them.
    return text.lower().translate(_PUNCTUATION_TO_SPACE)


clean_speeches = [_clean_text(speech) for speech in texts]
tokens = [word_tokenize(speech) for speech in clean_speeches]

In [None]:
# Removes stop words so that speech lengths can later be measured without them.
def remove_stop(speeches, stop=None):
    """Return a copy of each tokenised speech with its stop words removed.

    Parameters
    ----------
    speeches : iterable of list of str
        One list of word tokens per speech.
    stop : iterable of str, optional
        Words to drop. Defaults to the module-level ``stop_words`` list.

    Returns
    -------
    list of list of str
        One new token list per speech, in the same order. The input lists
        are left untouched.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every token.
    blocked = set(stop_words if stop is None else stop)
    return [[word for word in speech if word not in blocked] for speech in speeches]
# Tokenised speeches with stop words removed; their sizes feed the
# 'Speech Length' and 'Unique Words' columns built further down.
no_stop = remove_stop(tokens)

In [32]:
# Build the bag-of-words model: run the cleaned speeches through
# CountVectorizer to get the document-term matrix, then wrap it in a DataFrame
# with one row per speech and one column per word.
vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words, strip_accents='ascii')
X = vectorizer.fit_transform(clean_speeches)
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()
# The sparse matrix is densified only once; its row count comes from .shape.
speeches_df = pd.DataFrame(X.toarray(), columns=names, index=range(X.shape[0]))

In [None]:
# Daily approval figures to be merged with the speech data: the mean adjusted
# approval and disapproval over all polls sharing the same end date.
# 'enddate' is rendered as an 'mm/dd/YYYY' string because the speech dates are
# stored as strings in that format and the join is done on that column.
mean_adj_approval = (
    approval
    .groupby('enddate')[['adjusted_approve', 'adjusted_disapprove']]
    .mean()
    .reset_index()
    .assign(enddate=lambda polls: polls['enddate'].dt.strftime('%m/%d/%Y'))
)


In [None]:
# Counts, for every speech, the words that belong to one sentiment category.
def sentiment_variable(corpus_df, sentiment='feeling', term_counts=None):
    """Count per speech how often words of one sentiment category occur.

    Parameters
    ----------
    corpus_df : pandas.DataFrame
        Lexicon with a 'word' column and a 'sentiment' column.
    sentiment : str
        Value of the 'sentiment' column that selects the words to count.
    term_counts : pandas.DataFrame, optional
        Document-term matrix with one row per speech and one column per word.
        Defaults to the module-level ``speeches_df``.

    Returns
    -------
    pandas.Series
        Indexed like the rows of the document-term matrix; each value is the
        total number of occurrences of the selected words in that speech.
    """
    if term_counts is None:
        term_counts = speeches_df
    words = corpus_df.loc[corpus_df['sentiment'] == sentiment, 'word'].tolist()
    # Lexicon words that never occur in any speech become all-NaN columns,
    # which sum() skips.
    matched = term_counts.reindex(columns=words)
    # Sum explicitly across the word columns; relying on the default axis of a
    # DataFrame reduction (as np.sum(df) does) is deprecated in newer pandas.
    return matched.sum(axis=1)

In [None]:
# Counts, for every speech, the words that belong to one policy area.
# Returns a single column of counts, one entry per document.
def policy_variable(corpus_df, policy='issue', term_counts=None):
    """Count per speech how often words of one policy area occur.

    Parameters
    ----------
    corpus_df : pandas.DataFrame
        Lexicon with a 'word' column and a 'policy' column.
    policy : str
        Value of the 'policy' column that selects the words to count.
    term_counts : pandas.DataFrame, optional
        Document-term matrix with one row per speech and one column per word.
        Defaults to the module-level ``speeches_df``.

    Returns
    -------
    pandas.Series
        Indexed like the rows of the document-term matrix; each value is the
        total number of occurrences of the selected words in that speech.
    """
    if term_counts is None:
        term_counts = speeches_df
    words = corpus_df.loc[corpus_df['policy'] == policy, 'word'].tolist()
    # Lexicon words that never occur in any speech become all-NaN columns,
    # which sum() skips.
    matched = term_counts.reindex(columns=words)
    # Sum explicitly across the word columns; relying on the default axis of a
    # DataFrame reduction (as np.sum(df) does) is deprecated in newer pandas.
    return matched.sum(axis=1)

In [None]:
# Per-speech word counts for each category of the sentiment lexicon ...
trust, fear, sadness, anger, surprise, disgust, joy, anticipation = (
    sentiment_variable(sentiment_corpus, emotion)
    for emotion in (
        'trust', 'fear', 'sadness', 'anger',
        'surprise', 'disgust', 'joy', 'anticipation',
    )
)
# ... and for the two labels of the positive/negative lexicon.
positive, negative = (
    sentiment_variable(pos_neg_corpus, polarity)
    for polarity in ('positive', 'negative')
)

In [None]:
# Per-speech word counts for each policy area of the policy lexicon.
# The variable order on the left matches the label order on the right.
(gender, constitution, social, environment,
 military, foreign_relations, healthcare, economy) = (
    policy_variable(policy_corpus, policy=area)
    for area in (
        "woman's rights & sexual orientation",
        'constitutional rights',
        'socio-cultural issues',
        'environmental issues',
        'military and national security',
        'foreign relations',
        'healthcare & welfare',
        'us economy',
    )
)


In [None]:
# Assemble one record per speech, keyed by the speech's position in `dates`.
# The field order must match the column names given to the final data frame.
speech_data = {}
for i, speech_date in enumerate(dates):
    speech_tokens = no_stop[i]
    speech_data[i] = (
        i + 1,                # document number, starting at 1
        speech_date,
        titles[i],
        len(speech_tokens),   # speech length without stop words
        # Distinct words; a set avoids calling pd.unique on a plain list,
        # which newer pandas releases warn about.
        len(set(speech_tokens)),
        positive[i], negative[i],
        trust[i], fear[i], sadness[i], anger[i],
        surprise[i], disgust[i], joy[i], anticipation[i],
        gender[i], constitution[i], social[i], environment[i],
        military[i], foreign_relations[i], healthcare[i], economy[i],
    )
speech_data

In [None]:
# Column names for the final table, in the same order as the fields of each
# speech record.
# NOTE(review): ' Foreign Relations' carries a leading space; it is kept as-is
# because it ends up as a column header in the exported file - confirm whether
# that is intended.
var_names = (
    ['Doc Number', 'Date', 'Title', 'Speech Length', 'Unique Words']
    + ['Positive', 'Negative']
    + ['Trust', 'Fear', 'Sadness', 'Anger', 'Surprise', 'Disgust', 'Joy', 'Anticipation']
    + ['Womens Issues', 'Constitution Issues', 'Social Issues', 'Environmental Issues']
    + ['Military/Defense', ' Foreign Relations', 'Healthcare/Welfare', 'US Economy']
)

In [None]:
# Turn the per-speech records into a table: the dictionary yields one column
# per speech, so transpose to get one row per speech, then label the columns.
# NOTE(review): this rebinds `speeches_df`, which until here held the
# document-term matrix that sentiment_variable() and policy_variable() read by
# default; re-running those cells after this one would use the wrong frame.
speeches_df = pd.DataFrame(speech_data).transpose().set_axis(var_names, axis=1)
speeches_df

In [None]:
# Attach the daily approval figures to each speech by matching the speech date
# against the poll end date. Speeches without a matching day take the values
# of the next row that has them (back-fill); the join key is then dropped.
approval_columns = ['adjusted_approve', 'adjusted_disapprove']
cleaned = speeches_df.merge(mean_adj_approval, how='left', left_on='Date', right_on='enddate')
cleaned[approval_columns] = cleaned[approval_columns].bfill()
cleaned = cleaned.drop(columns='enddate')
cleaned

In [None]:
# Finally, export the merged data frame to an Excel spreadsheet, leaving out
# the row index.
# NOTE(review): absolute, machine-specific output path; the export only works
# on a machine where this directory exists.
cleaned.to_excel('C:/Users/brian/Documents/Python/speech_data.xlsx', index=False)

In [None]:
#def remove_stop(speeches):
    ### this function is meant to remove stop words from a list of speeches
    #cleaned = []
    #for a in speeches:
        #pos = []
        #for i, x in enumerate(a):
            #if x in stop_words:
                #pos.append(i)
        #for i in reversed(pos):
            #a.pop(i)
        #cleaned.append(a)
    #return cleaned
#FUNCTION WAS WRITTEN TO ITERATE THROUGH A LIST OF LISTS AND REMOVE STOP WORDS

#THIS CHUNK OF CODE SHOULD STILL SERVE A PURPOSE AND IS SOME OF THE BEST CODE YOU HAVE WRITTEN

In [None]:
#THE COMMENTED-OUT CODE BELOW IS VERY TIME-CONSUMING TO RUN, SO I HAVE DISABLED IT
#it produces a term frequency plot

#distinct_words = []
#for i in range(842):
    #vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words, strip_accents='ascii', min_df=i/1020)
    #X = vectorizer.fit_transform(clean_speeches)
    #distinct_words.append(X.toarray().shape[1])
#_ = plt.plot(distinct_words)
#_ = plt.ylabel('# of Unique Words')
#_ = plt.xlabel('# of Documents')
#_ = plt.title('Term Frequency Plot')
#plt.show()

In [None]:
#viewing what percentage of a speech is disregarded when matching it against the corpus we use
#I THINK THAT IT IS IMPORTANT TO NOTE HOW MUCH OF THE SPEECH WE ARE LOSING WHEN WE COMPARE IT TO THE DIFFERENT
#VARIABLE CORPORA
#import numpy as np
#test = pd.DataFrame(no_stop_words, columns=['Word'])
#test2 = pd.merge(test, sentiment_corpus, left_on= 'Word', right_on='word', how='left')
#t2 = test2.isna()
#np.sum(t2['sentiment'])/len(t2['sentiment'])