In [2]:
#import modules
from __future__ import division
import pandas as pd
import numpy as np 
pd.set_option('display.max_colwidth', 1500)

In [3]:
#import csv into dataframe as original text
speechDF = pd.read_csv("bidenspeech.csv", index_col = 0) 
print(speechDF.shape)
speechDF.head(5)

(219, 1)


Unnamed: 0,original
0,This is America’s day.
1,This is democracy’s day.
2,A day of history and hope.
3,Of renewal and resolve.
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.


In [4]:
#import stopwords
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words("english") 

In [5]:
#examine the token spread
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=False, stop_words = nltk_stopwords) 
cv_dm = cv.fit_transform(speechDF['original'])
print(cv_dm.shape)
names = cv.get_feature_names()
count = np.sum(cv_dm.toarray(), axis = 0)
count2 = count.tolist()
count_df = pd.DataFrame(count2, index = names, columns = ['count'])
count_df.sort_values(['count'], ascending = False)[0:10]

(219, 616)


Unnamed: 0,count
us,27
america,20
one,15
nation,14
democracy,11
must,10
people,9
another,9
american,9
today,9


In [6]:
#import lemmatizer
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [7]:
#lemmatize the original text, add to dataframe
speechDF['lemmatized'] = speechDF["original"].apply(lambda x: [wnl.lemmatize(y, pos = 'v') for y in x.split()])
speechDF['lemmatized']= [" ".join(token) for token in speechDF['lemmatized']]

speechDF.head(5)

Unnamed: 0,original,lemmatized
0,This is America’s day.,This be America’s day.
1,This is democracy’s day.,This be democracy’s day.
2,A day of history and hope.,A day of history and hope.
3,Of renewal and resolve.,Of renewal and resolve.
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.,Through a crucible for the age America have be test anew and America have rise to the challenge.


In [8]:
#examine the tokens from lammetied text
cv_dm = cv.fit_transform(speechDF['lemmatized'])
print(cv_dm.shape)
names = cv.get_feature_names()   
count = np.sum(cv_dm.toarray(), axis = 0) 
count2 = count.tolist()  
count_df = pd.DataFrame(count2, index = names, columns = ['count']) 
count_df.sort_values(['count'], ascending = False)[0:10]  

(219, 582)


Unnamed: 0,count
us,27
america,20
one,15
nation,14
democracy,11
must,10
another,9
people,9
american,9
story,9


In [9]:
# cleaning the lammetized text further, combining some plurals, fixing a few werid cutoffs
speech_dict = {'americans':'american', 'days':'day', 'centuries':'century', 
               'businesses':'business', 'ha':'has', 'periods':'period'}


def multiple_replace(dict, text): 

  text = str(text).lower()
    
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

In [134]:
#apply the cleaned text to the dataframe
speechDF['cleantext'] = speechDF.lemmatized.apply(lambda x: multiple_replace(speech_dict, x))
speechDF[0:10]

Unnamed: 0,original,lemmatized,cleantext
0,This is America’s day.,This be America’s day.,this be america’s day.
1,This is democracy’s day.,This be democracy’s day.,this be democracy’s day.
2,A day of history and hope.,A day of history and hope.,a day of history and hope.
3,Of renewal and resolve.,Of renewal and resolve.,of renewal and resolve.
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.,Through a crucible for the age America have be test anew and America have rise to the challenge.,through a crucible for the age america hasve be test anew and america hasve rise to the chasllenge.
5,"Today, we celebrate the triumph not of a candidate, but of a cause, the cause of democracy.","Today, we celebrate the triumph not of a candidate, but of a cause, the cause of democracy.","today, we celebrate the triumph not of a candidate, but of a cause, the cause of democracy."
6,The will of the people has been heard and the will of the people has been heeded.,The will of the people have be hear and the will of the people have be heeded.,the will of the people hasve be hear and the will of the people hasve be heeded.
7,We have learned again that democracy is precious.,We have learn again that democracy be precious.,we hasve learn again thast democracy be precious.
8,Democracy is fragile.,Democracy be fragile.,democracy be fragile.
9,"And at this hour, my friends, democracy has prevailed.","And at this hour, my friends, democracy have prevailed.","and at this hour, my friends, democracy hasve prevailed."


## Time For Sentiment Analysis - Going with HL because of the size of the dictionary. Designed from a wider variety of sources, allows for a more detailed analysis.


In [135]:
#set up the HL dictionaries and function
HLpos = [line.strip() for line in  open('HLpos.txt','r')]
HLneg = [line.strip() for line in  open('HLneg.txt','r',encoding = 'latin-1')]

def hl_sent(inputstring):
    poscount = 0
    negcount = 0
    for word in inputstring.split(): 
        if HLpos.count(word.rstrip('?:!.,;')):
            poscount +=1
        elif HLneg.count(word.rstrip('?:!.,;')):
            negcount +=1
    if poscount+negcount > 0:
        t = float((poscount - negcount)/(poscount+negcount))    
    else:
        t = 0
    if t > 0:
        tone = "Positive"
    elif t < 0:
        tone = "Negative"
    else:
        tone = "Neutral"
    return tone

In [136]:
#Run analysis on original text
speechDF['hlsentOG'] = speechDF.original.apply(lambda x: hl_sent(x))

In [137]:
#check results
speechDF

Unnamed: 0,original,lemmatized,cleantext,hlsentOG
0,This is America’s day.,This be America’s day.,this be america’s day.,Neutral
1,This is democracy’s day.,This be democracy’s day.,this be democracy’s day.,Neutral
2,A day of history and hope.,A day of history and hope.,a day of history and hope.,Neutral
3,Of renewal and resolve.,Of renewal and resolve.,of renewal and resolve.,Neutral
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.,Through a crucible for the age America have be test anew and America have rise to the challenge.,through a crucible for the age america hasve be test anew and america hasve rise to the chasllenge.,Neutral
...,...,...,...,...
214,Sustained by faith.,Sustained by faith.,sustained by faith.,Positive
215,Driven by conviction.,Driven by conviction.,driven by conviction.,Neutral
216,"And, devoted to one another and to this country we love with all our hearts.","And, devote to one another and to this country we love with all our hearts.","and, devote to one another and to this country we love with all our hearts.",Positive
217,May God bless America and may God protect our troops.,May God bless America and may God protect our troops.,may god bless america and may god protect our troops.,Positive


In [138]:
#run analysis on lemmatized text
speechDF['hlsentLEM'] = speechDF.lemmatized.apply(lambda x: hl_sent(x))

In [139]:
#check results
speechDF

Unnamed: 0,original,lemmatized,cleantext,hlsentOG,hlsentLEM
0,This is America’s day.,This be America’s day.,this be america’s day.,Neutral,Neutral
1,This is democracy’s day.,This be democracy’s day.,this be democracy’s day.,Neutral,Neutral
2,A day of history and hope.,A day of history and hope.,a day of history and hope.,Neutral,Neutral
3,Of renewal and resolve.,Of renewal and resolve.,of renewal and resolve.,Neutral,Neutral
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.,Through a crucible for the age America have be test anew and America have rise to the challenge.,through a crucible for the age america hasve be test anew and america hasve rise to the chasllenge.,Neutral,Neutral
...,...,...,...,...,...
214,Sustained by faith.,Sustained by faith.,sustained by faith.,Positive,Positive
215,Driven by conviction.,Driven by conviction.,driven by conviction.,Neutral,Neutral
216,"And, devoted to one another and to this country we love with all our hearts.","And, devote to one another and to this country we love with all our hearts.","and, devote to one another and to this country we love with all our hearts.",Positive,Positive
217,May God bless America and may God protect our troops.,May God bless America and may God protect our troops.,may god bless america and may god protect our troops.,Positive,Positive


In [140]:
#run analysis on clean text
speechDF['hlsentCLEAN'] = speechDF.cleantext.apply(lambda x: hl_sent(x))

In [141]:
#check results
speechDF

Unnamed: 0,original,lemmatized,cleantext,hlsentOG,hlsentLEM,hlsentCLEAN
0,This is America’s day.,This be America’s day.,this be america’s day.,Neutral,Neutral,Neutral
1,This is democracy’s day.,This be democracy’s day.,this be democracy’s day.,Neutral,Neutral,Neutral
2,A day of history and hope.,A day of history and hope.,a day of history and hope.,Neutral,Neutral,Neutral
3,Of renewal and resolve.,Of renewal and resolve.,of renewal and resolve.,Neutral,Neutral,Neutral
4,Through a crucible for the ages America has been tested anew and America has risen to the challenge.,Through a crucible for the age America have be test anew and America have rise to the challenge.,through a crucible for the age america hasve be test anew and america hasve rise to the chasllenge.,Neutral,Neutral,Neutral
...,...,...,...,...,...,...
214,Sustained by faith.,Sustained by faith.,sustained by faith.,Positive,Positive,Positive
215,Driven by conviction.,Driven by conviction.,driven by conviction.,Neutral,Neutral,Neutral
216,"And, devoted to one another and to this country we love with all our hearts.","And, devote to one another and to this country we love with all our hearts.","and, devote to one another and to this country we love with all our hearts.",Positive,Positive,Positive
217,May God bless America and may God protect our troops.,May God bless America and may God protect our troops.,may god bless america and may god protect our troops.,Positive,Positive,Positive


## Let's see how these sentiments compare!

In [150]:
print(speechDF.hlsentOG.value_counts())
print('\n')
print(speechDF.hlsentLEM.value_counts())
print('\n')
print(speechDF.hlsentCLEAN.value_counts())

Neutral     120
Positive     63
Negative     36
Name: hlsentOG, dtype: int64


Neutral     120
Positive     61
Negative     38
Name: hlsentLEM, dtype: int64


Neutral     117
Positive     67
Negative     35
Name: hlsentCLEAN, dtype: int64


## Q1 - 
For this assignment I wanted to do a sentiment analysis on the recent inaugural address by President Joe Biden. I found his speech on the White House website and input it into a csv file for easy access. I first imported the data into a dataframe, and began to count vectorize the tokens in the overall corpus. I wanted to understand what words I should look out for while trying to clean the text. Next I lemmatized the text. I chose to lemmatize because I knew that it would keep the text as true to natural words as possible, seeing that this is a speech, I wanted to maintain the text closest to the original format. To run this lemmatization I performed it based on the ‘verbs’ as its part of speech. Presidential addresses tend to be about action, so using verbs felt the most appropriate. The results of this process were mostly as expected, and I believe using ‘verbs’ was the correct choice. It tended to correct the tense of the verbs used, from ‘spoke’ to ‘speak’, for example. However, I did find it interesting that the algorithm liked to replace the word ‘is’ and ‘been’ with the word ‘be’. I can understand the idea behind present tensing ‘been’ but the word ‘is’ seemed straight forward enough, that I was surprised to see it changed. After the text was lemmatized, I count vectorized it again to see what words were left over. 

From analyzing the token counts of the lemmatized text, I could see there were a few ‘mishaps’ were made that I wanted to manually clean. The lemmatized didn’t seem to pick up on many plurals, so I combined them, such as ‘Americans’, ‘days’, and ‘centuries’, for example, with their singular versions. I also replaced a couple of ‘ha’ that had been cut from ‘has’. After this cleaning I applied it to the dataframe and got ready for the sentiment analysis.

## Q2 -
For the sentiment analysis, I decided to go with the HL dictionary. I felt that since it had a larger dictionary to work from, it could handle a bit more in its determination. While researching these dictionaries, I liked that the HL groups came from multiple sources of language, not just tweets, for example.

I then applied the HL analysis to all three of my text groups: original, lemmatized, and cleaned. The results were mostly as expected. I was however expecting a heavier presence of positive sentiment, but most of what was analyzed came back as neutral. However, positive did win over negative each time. The President’s speech came off a very dark year in our country, but I remember him putting a lot of emphasis on the good days to come. So it was nice to see that the analyzer picked up on more positives than negatives. It was interesting to see that in one line, the president thanks his predecessors for their presence at the event. Both the original and lemmatized text found this positive, yet the clean text found it neutral. And another interesting assessment found that the original and lemmatized texts found the single word line of “unity.” to be neutral, yet when the clean text uses lower case “unity.” It is found as positive.

I do not think that one type of text provided a more accurate portrayal of the sentiment than the other. There were not many differences between the three analysis. Less than 10 lines switched from one group to the next. I do think that if I could make any changes, seeing as all three were most dominated by neutral sentiment, that I would make each document a combination of a few lines, I would want to give the algorithm more text to make its decision on. I think the source may have broken up the lines just enough that they ended up mostly being neutral, whereas the previous or next line could make a large impression on the overall sentiment when read in context.