In [1]:
# import packages
import re
import urllib.robotparser
from collections import Counter
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# This code checks the robots.txt file
def canFetch(url):
    """Report whether the site's robots.txt allows fetching ``url``.

    Parameters
    ----------
    url : str
        Absolute URL of the page that is about to be requested.

    Returns
    -------
    bool or None
        The result of ``RobotFileParser.can_fetch`` for the wildcard user
        agent ``"*"``, or ``None`` when robots.txt could not be read or
        evaluated, so callers can tell "disallowed" from "unknown".
    """
    parsed_uri = urlparse(url)

    # Build the robots.txt location from scheme and host only. The previous
    # code appended "/robots.txt" to a base that already ended in "/", which
    # produced a "//robots.txt" path.
    robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        canFetchBool = rp.can_fetch("*", url)
    except Exception:
        # Best-effort check: any failure while reading/evaluating robots.txt
        # is reported as "unknown". KeyboardInterrupt/SystemExit still propagate.
        canFetchBool = None

    return canFetchBool

# Trump vs Biden Wikipedia Sentiment Analysis: 

My question is which president is described more positively, as measured by a sentiment analysis and by counting word frequencies. A secondary question, whether Wikipedia (a website meant to be based solely on facts, not opinion) shows bias when describing Trump or Biden, can be addressed with this sentiment analysis as well. I will perform web scraping on two Wikipedia pages, one about Trump and another about Biden. Then I will clean the data to include only sentences that mention either Trump or Biden. I will find out which words are most frequent in both texts. Then I will perform a sentiment analysis, find the mean of the compound score for each article, and compare the scores of the two political figures.


In [2]:
# Wikipedia articles to analyse
url_trump = "https://en.wikipedia.org/wiki/Donald_Trump" # Trump wiki page
url_biden = "https://en.wikipedia.org/wiki/Joe_Biden" # Biden wiki page

# Honour robots.txt before downloading anything. canFetch returns None when
# robots.txt could not be checked; only an explicit False stops the download.
for article_url in (url_trump, url_biden):
    if canFetch(article_url) is False:
        raise PermissionError("robots.txt disallows fetching " + article_url)

# A timeout keeps the cell from hanging forever, and raise_for_status makes an
# HTTP error fail here instead of silently feeding an error page to the analysis.
r_trump = requests.get(url_trump, timeout=5)
r_trump.raise_for_status()
urlText_trump = r_trump.text  # grabs the text from trump article

r_biden = requests.get(url_biden, timeout=5)
r_biden.raise_for_status()
urlText_biden = r_biden.text # grabs the text from biden article


In [3]:
# uses BeautifulSoup to parse through the Trump article and get the information I need

# Parse the response that was already downloaded (r_trump) instead of requesting
# the same page a second time. The raw bytes are passed so BeautifulSoup handles
# the decoding, exactly as it did for the second download.
soup = BeautifulSoup(r_trump.content, "html.parser")

paragraphs = soup.find_all('p') # finds all paragraph tags in article

# plain text of every paragraph (get_text drops the html tags)
trump_paragraph_texts = [paragraph.get_text().strip() for paragraph in paragraphs]

# Splits on '.' so each component is roughly one sentence.
# NOTE: this is a naive split; abbreviations and decimal numbers are split too.
trump_sentences = " ".join(trump_paragraph_texts).split('.')

# only includes sentences that include Trump
text_content_trump = [s for s in trump_sentences if ('TRUMP' in s.upper()) or ('DONALD' in s.upper())]
print(text_content_trump[:5])

[' Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021', ' Trump received a Bachelor of Science in economics from the University of Pennsylvania in 1968, and his father named him president of his real estate business in 1971', ' Trump renamed it the Trump Organization and reoriented the company toward building and renovating skyscrapers, hotels, casinos, and golf courses', ' After a series of business failures in the late twentieth century, he successfully launched side ventures that required little capital, mostly by licensing the Trump name', ' Trump won the 2016 presidential election as the Republican Party nominee against Democratic Party nominee Hillary Clinton while losing the popular vote']


In [4]:
# uses BeautifulSoup to parse through the Biden article and get the information I need

# Parse the response that was already downloaded (r_biden) instead of requesting
# the same page a second time. The raw bytes are passed so BeautifulSoup handles
# the decoding, exactly as it did for the second download.
soup = BeautifulSoup(r_biden.content, "html.parser")

paragraphs = soup.find_all('p') # finds all paragraph tags in article

# plain text of every paragraph (get_text drops the html tags)
biden_paragraph_texts = [paragraph.get_text().strip() for paragraph in paragraphs]

# Splits on '.' so each component is roughly one sentence.
# NOTE: this is a naive split; abbreviations and decimal numbers are split too.
biden_sentences = " ".join(biden_paragraph_texts).split('.')

# only includes sentences that include Biden
text_content_biden = [s for s in biden_sentences if ('BIDEN' in s.upper()) or ('JOE' in s.upper())]
print(text_content_biden[:5])

[' Joseph Robinette Biden Jr', ' Born in Scranton, Pennsylvania, Biden moved with his family to Delaware in 1953', ' As a senator, Biden drafted and led the effort to pass the Violent Crime Control and Law Enforcement Act and the Violence Against Women Act', ' Biden ran unsuccessfully for the Democratic presidential nomination in 1988 and 2008', ' In 2008, Obama chose Biden as his running mate, and he was a close counselor to Obama during his two terms as vice president']


In [5]:
# Whitespace-tokenise the Trump sentences into a flat list of words.
trump_tokens = " ".join(text_content_trump).split()

# Keep tokens of at least three characters that are not "the"/"and"
# (compared case-insensitively) as a minimal stop-word filter.
text_content_trump_words = [
    token
    for token in trump_tokens
    if len(token) >= 3 and token.upper() not in ('THE', 'AND')
]

trump_total_words = len(text_content_trump_words)
trump_distinct_words = len(set(text_content_trump_words))
print("Total words: " + str(trump_total_words))
print("Distinct words: " + str(trump_distinct_words))

Total words: 8527
Distinct words: 3733


In [6]:
# Whitespace-tokenise the Biden sentences into a flat list of words.
biden_tokens = " ".join(text_content_biden).split()

# Keep tokens of at least three characters that are not "the"/"and"
# (compared case-insensitively) as a minimal stop-word filter.
text_content_biden_words = [
    token
    for token in biden_tokens
    if len(token) >= 3 and token.upper() not in ('THE', 'AND')
]

biden_total_words = len(text_content_biden_words)
biden_distinct_words = len(set(text_content_biden_words))
print("Total words: " + str(biden_total_words))
print("Distinct words: " + str(biden_distinct_words))

Total words: 5870
Distinct words: 2878


In [7]:
# Counts how many times each word appears in the Trump sentences.
# Counter is a dict subclass (word -> count), so it can be used anywhere the
# hand-built dictionary was used; keys keep first-seen order.
trump_categorical_distribution = Counter(text_content_trump_words)

# displays the top 15 most frequent words, highest count first
# (most_common keeps first-seen order among words with equal counts)
trump_most_words = dict(trump_categorical_distribution.most_common(15))
trump_most_words

{'Trump': 424,
 'his': 133,
 'that': 123,
 'for': 103,
 "Trump's": 101,
 'was': 85,
 'with': 75,
 'from': 58,
 'had': 56,
 'were': 47,
 'not': 39,
 'election': 31,
 'against': 28,
 'presidential': 26,
 'which': 26}

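### As the dictionary shows, the most common descriptive word in the Trump article is 'not' (strictly speaking a negating adverb rather than an adjective). This word does have a negative connotation, but on its own it isn't substantial enough to support any claims. Note also that the counts are case-sensitive and keep punctuation, so 'Trump' and "Trump's" are counted separately. The sentiment analysis will provide a deeper analysis of this.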

In [8]:
# Counts how many times each word appears in the Biden sentences.
# Counter is a dict subclass (word -> count), so it can be used anywhere the
# hand-built dictionary was used; keys keep first-seen order.
biden_categorical_distribution = Counter(text_content_biden_words)

# displays the top 20 most frequent words, highest count first
# (most_common keeps first-seen order among words with equal counts)
biden_most_words = dict(biden_categorical_distribution.most_common(20))
biden_most_words

{'Biden': 295,
 'his': 104,
 'for': 94,
 'was': 71,
 'that': 71,
 'with': 59,
 "Biden's": 56,
 'had': 45,
 'Obama': 34,
 'has': 29,
 'from': 26,
 'but': 24,
 'Senate': 24,
 'president': 21,
 'first': 19,
 'said': 19,
 'Act': 18,
 'not': 18,
 'during': 17,
 'after': 17}

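### As the dictionary shows, the most common descriptive word in the Biden article is also 'not' (strictly speaking a negating adverb rather than an adjective). Although this is the same word as in the Trump article, it appears roughly twice as often in the Trump text (39 times versus 18). Keep in mind that the Trump text is also longer (8,527 versus 5,870 words), so part of this gap simply reflects its length.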

In [9]:
# Scores every sentence of the Trump article with VADER.

sid = SentimentIntensityAnalyzer()

# One record per sentence: the polarity scores returned by VADER
# plus the sentence itself stored under the 'text' key.
trumpSentiments = [
    {**sid.polarity_scores(trump_sentence), 'text': trump_sentence}
    for trump_sentence in text_content_trump
]

trumpSentiments[:5] # first five records: Trump sentences with their sentiment scores

[{'neg': 0.0,
  'neu': 0.909,
  'pos': 0.091,
  'compound': 0.4215,
  'text': ' Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'text': ' Trump received a Bachelor of Science in economics from the University of Pennsylvania in 1968, and his father named him president of his real estate business in 1971'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'text': ' Trump renamed it the Trump Organization and reoriented the company toward building and renovating skyscrapers, hotels, casinos, and golf courses'},
 {'neg': 0.101,
  'neu': 0.741,
  'pos': 0.158,
  'compound': 0.1779,
  'text': ' After a series of business failures in the late twentieth century, he successfully launched side ventures that required little capital, mostly by licensing the Trump name'},
 {'neg': 0.083,
  'neu': 

In [10]:
# Scores every sentence of the Biden article with VADER
# (reuses the `sid` analyzer created in the Trump sentiment cell).

# One record per sentence: the polarity scores returned by VADER
# plus the sentence itself stored under the 'text' key.
bidenSentiments = [
    {**sid.polarity_scores(biden_sentence), 'text': biden_sentence}
    for biden_sentence in text_content_biden
]

bidenSentiments[:5] # first five records: Biden sentences with their sentiment scores

[{'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'text': ' Joseph Robinette Biden Jr'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'text': ' Born in Scranton, Pennsylvania, Biden moved with his family to Delaware in 1953'},
 {'neg': 0.354,
  'neu': 0.646,
  'pos': 0.0,
  'compound': -0.91,
  'text': ' As a senator, Biden drafted and led the effort to pass the Violent Crime Control and Law Enforcement Act and the Violence Against Women Act'},
 {'neg': 0.197,
  'neu': 0.803,
  'pos': 0.0,
  'compound': -0.4019,
  'text': ' Biden ran unsuccessfully for the Democratic presidential nomination in 1988 and 2008'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'text': ' In 2008, Obama chose Biden as his running mate, and he was a close counselor to Obama during his two terms as vice president'}]

In [11]:
# Turns the list of Trump sentiment records into a dataframe (one row per sentence).
trumpSentimentDf = pd.DataFrame(data=trumpSentiments)
# Shows the rows ordered by compound score, most negative first.
trumpSentimentDf.sort_values(by='compound', ascending=True)

Unnamed: 0,neg,neu,pos,compound,text
454,0.270,0.730,0.000,-0.9371,[776][777] A nationwide review by ABC News in ...
451,0.259,0.689,0.052,-0.9022,[766] During and since the 2020 presidential e...
280,0.256,0.710,0.034,-0.8979,"[473][474][475] By early April, as the pandemi..."
9,0.280,0.663,0.058,-0.8934,Trump refused to concede after losing the 202...
226,0.311,0.689,0.000,-0.8860,[370] Trump said he resisted punishing China f...
...,...,...,...,...,...
246,0.000,0.714,0.286,0.7906,"[406][407] In May 2018, Trump withdrew the Uni..."
135,0.000,0.702,0.298,0.8176,[186] Duke enthusiastically supported Trump an...
380,0.083,0.623,0.293,0.8439,", Trump tweeted that the rioters should ""go ho..."
316,0.000,0.803,0.197,0.8442,"[538] In January 2017, American intelligence a..."


In [12]:
# Turns the list of Biden sentiment records into a dataframe (one row per sentence).
bidenSentimentDf = pd.DataFrame(data=bidenSentiments)
# Shows the rows ordered by compound score, most negative first.
bidenSentimentDf.sort_values(by='compound', ascending=True)

Unnamed: 0,neg,neu,pos,compound,text
56,0.386,0.572,0.043,-0.9572,"[89] In 1994, Biden helped pass the Violent Cr..."
2,0.354,0.646,0.000,-0.9100,"As a senator, Biden drafted and led the effor..."
29,0.317,0.683,0.000,-0.8677,[43] Biden had not openly supported or opposed...
9,0.355,0.561,0.084,-0.8658,"During the Israel–Hamas war, Biden announced ..."
77,0.360,0.552,0.088,-0.8555,[122] Biden became interested in the Yugoslav ...
...,...,...,...,...,...
314,0.000,0.617,0.383,0.8519,"[570] Biden supports environmental justice, in..."
68,0.000,0.554,0.446,0.9052,"Conservatives were angered,[110] but at the h..."
266,0.000,0.543,0.457,0.9118,[453] Democrats credited Biden for their unexp...
272,0.000,0.707,0.293,0.9201,"[465] In September 2021, Biden announced AUKUS..."


In [13]:
# Mean compound score over all of the relevant Biden sentences.
bidenSentimentDf.loc[:, 'compound'].mean()

0.05151058823529413

In [14]:
# Mean compound score over all of the relevant Trump sentences.
trumpSentimentDf.loc[:, 'compound'].mean()

-0.035652448979591835

# Conclusions


As we can see, Trump's mean compound sentiment score, computed over the sentences of his Wikipedia page that mention him, is slightly negative (about -0.036). Meanwhile, Biden's mean compound sentiment score, computed the same way over his Wikipedia page, is slightly positive (about 0.052). This can mean many different things. It can show that Trump is a more controversial president, since Wikipedia's description of his actions carries a more negative connotation. It can also suggest that Wikipedia might not be completely impartial when stating the facts, since there is a noticeable difference between the two presidents' scores. However, I don't think this claim about Wikipedia is well supported, since both compound scores are close to 0. This suggests that Wikipedia was mostly impartial when describing both presidents. Thus, using the data I gathered, I can assert that Biden's actions are described in a slightly more positive manner than Trump's.