In [29]:
import numpy as np
import urllib.request as request
import urllib
import json
import pandas as pd

def download_article_revisions(article_name: str):
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = "titles=" + urllib.parse.quote(article_name)
    content = "prop=revisions&rvprop=content"
    dataformat ="format=json"

    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    wikiresponse = request.urlopen(query)
    wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
        # page content
    for _, value in json.loads(wikitext)["query"]["pages"].items():
         return value["revisions"][0]["*"]


def download_article_extracts(article_name: str):
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = "titles=" + urllib.parse.quote_plus(article_name)
    content = "prop=extracts"
    exlimit = "exlimit=1"
    explaintext = "explaintext=1"
    dataformat ="format=json"

    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    wikiresponse = request.urlopen(query)
    wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
        # page content
    for _, value in json.loads(wikitext)["query"]["pages"].items():
            return value ["extract"] 

ids = ["Donald_Trump", "Barack_Obama", "Joe_Biden"]
wiki_ids = []
raw_texts = []
isTalk = []
for id in ids:
     talk_id = f"Talk:{id}"

     wiki_ids.append(id)
     isTalk.append(False)
     raw_texts.append(download_article_extracts(id))

     wiki_ids.append(id)
     isTalk.append(True)
     raw_texts.append(download_article_extracts(talk_id))

wiki_pages = pd.DataFrame({"id": wiki_ids, "is_talk": isTalk, "raw_content": raw_texts})
wiki_pages.head()

Unnamed: 0,id,is_talk,raw_content
0,Donald_Trump,False,"<p class=""mw-empty-elt"">\n\n\n</p>\n<link rel=..."
1,Donald_Trump,True,"<link rel=""mw-deduplicated-inline-style"" href=..."
2,Barack_Obama,False,"<link rel=""mw-deduplicated-inline-style"" href=..."
3,Barack_Obama,True,"<link rel=""mw-deduplicated-inline-style"" href=..."
4,Joe_Biden,False,"<p class=""mw-empty-elt"">\n\n\n</p>\n<p><b>Jose..."


In [30]:
import string
from nltk import word_tokenize, Text
from nltk.stem import WordNetLemmatizer

tokens = []
wnl = WordNetLemmatizer()
for raw in wiki_pages.raw_content:
    # tokenization and lemmatization of words (filtering out punctuation)
    processed = list(filter(lambda token: wnl.lemmatize(token.lower()) not in string.punctuation, word_tokenize(raw)))
    tokens.append(processed)

In [31]:
wiki_pages["tokens"] = tokens
labMT = pd.read_csv("../assignments/labMT.txt", sep="\t")
# to facilitate happiness_average value lookup
labMT.set_index("word", inplace=True)
# labMT.head()

In [32]:
def sentiment(tokens):
    if(len(tokens) == 0):
        return
    freq = FreqDist(tokens)

    # filter for the vocabulary we can evaluate with LabMT
    vocab = list(filter(lambda word: word in labMT.index, np.unique(tokens)))

    # array of each token's average happiness weighted by the token's frequency
    weighted_happiness = np.fromiter((freq[word] * labMT.loc[word].happiness_average for word in vocab), dtype=float)
    # each token's frequency
    word_frequencies = np.fromiter((freq[word] for word in vocab), dtype=float)
    return np.sum(weighted_happiness) / np.sum(word_frequencies)

In [37]:
from nltk import FreqDist

sentiments = []
for token_list in wiki_pages.tokens:
    # compute sentiment for individual rapper wiki page
    sentiment_value = sentiment(token_list)
    if(sentiment_value):
        sentiments.append(sentiment_value)

wiki_pages["sentiment"] = sentiments
# use the rappers' name as index
# wiki_pages.set_index("id", inplace=True)
wiki_pages.head(10)

Unnamed: 0_level_0,is_talk,raw_content,tokens,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Donald_Trump,False,"<p class=""mw-empty-elt"">\n\n\n</p>\n<link rel=...","[p, class=, '', mw-empty-elt, '', /p, link, re...",5.224624
Donald_Trump,True,"<link rel=""mw-deduplicated-inline-style"" href=...","[link, rel=, '', mw-deduplicated-inline-style,...",5.413483
Barack_Obama,False,"<link rel=""mw-deduplicated-inline-style"" href=...","[link, rel=, '', mw-deduplicated-inline-style,...",5.321338
Barack_Obama,True,"<link rel=""mw-deduplicated-inline-style"" href=...","[link, rel=, '', mw-deduplicated-inline-style,...",5.377172
Joe_Biden,False,"<p class=""mw-empty-elt"">\n\n\n</p>\n<p><b>Jose...","[p, class=, '', mw-empty-elt, '', /p, p, b, Jo...",5.269958
Joe_Biden,True,"<p><br></p>\n<link rel=""mw-deduplicated-inline...","[p, br, /p, link, rel=, '', mw-deduplicated-in...",5.396449


In [42]:
talks = wiki_pages[wiki_pages.is_talk == True]
articles = wiki_pages[wiki_pages.is_talk == False]

for id in talks.index:
    print(f"Token number in {id} talk page: {len(talks.loc[id].tokens)}")
    print(f"Sentiment in {id} talk page:", talks.loc[id].sentiment)
    print()

Token number in Donald_Trump talk page: 17743
Sentiment in Donald_Trump talk page: 5.4134830371567055

Token number in Barack_Obama talk page: 5375
Sentiment in Barack_Obama talk page: 5.377172330097087

Token number in Joe_Biden talk page: 9231
Sentiment in Joe_Biden talk page: 5.396449351559761



In [43]:
for id in articles.index:
    print(f"Token number in {id} article page: {len(articles.loc[id].tokens)}")
    print(f"Sentiment in {id} article page:", articles.loc[id].sentiment)
    print()

Token number in Donald_Trump article page: 20994
Sentiment in Donald_Trump article page: 5.224624036495202

Token number in Barack_Obama article page: 14778
Sentiment in Barack_Obama article page: 5.321338260375213

Token number in Joe_Biden article page: 13929
Sentiment in Joe_Biden article page: 5.2699580468476865

