# Activity 3

## 3.1 Sub-activity: Loading and pre-processing of text data

### Task 1:

In [1]:
# To figure out how to get the Wikidata and Wikipedia APIs working I used code from these sources:
# https://www.jcchouinard.com/wikidata-api-python/
# https://www.wikidata.org/wiki/Wikidata:REST_API
# https://agg-shashank.medium.com/an-introduction-to-using-wikidata-apis-a678ee6d2968
# https://www.wikidata.org/wiki/Wikidata:REST_API
# https://wikipedia-api.readthedocs.io/en/latest/wikipediaapi/api.html

import requests

def get_turing_award_recipients():
    wikidata_api_url = "https://www.wikidata.org/w/api.php" # base url
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'backlinks',
        'bltitle': 'Q185667',  # wikidata item for turing award
        'blnamespace': 0,
        'blfilterredir': 'nonredirects',
        'bllimit': 500  # limit
    }

    response = requests.get(url=wikidata_api_url, params=params)
    backlinks = response.json()['query']['backlinks'] # ids for recipients are found in 'backlinks' of response

    if not backlinks:
        print("No backlinks found.")
        return

    recipients = [] # declare empty list for recipients
    for backlink in backlinks: # get each recipient
        recipient_id = backlink['title']
        recipients.append(recipient_id) #also gets ids of other links in table
    return recipients

turing_award_recipients = get_turing_award_recipients() # get recipients
for recipient_id in turing_award_recipients:
   print(f"Recipient ID: {recipient_id}")


Recipient ID: Q80
Recipient ID: Q8556
Recipient ID: Q9602
Recipient ID: Q11609
Recipient ID: Q17457
Recipient ID: Q45575
Recipient ID: Q49823
Recipient ID: Q62843
Recipient ID: Q62857
Recipient ID: Q62861
Recipient ID: Q62866
Recipient ID: Q62870
Recipient ID: Q62874
Recipient ID: Q62877
Recipient ID: Q62888
Recipient ID: Q62894
Recipient ID: Q62898
Recipient ID: Q92596
Recipient ID: Q92602
Recipient ID: Q92604
Recipient ID: Q92606
Recipient ID: Q92609
Recipient ID: Q92612
Recipient ID: Q92613
Recipient ID: Q92614
Recipient ID: Q92618
Recipient ID: Q92619
Recipient ID: Q92625
Recipient ID: Q92626
Recipient ID: Q92628
Recipient ID: Q92629
Recipient ID: Q92632
Recipient ID: Q92638
Recipient ID: Q92641
Recipient ID: Q92643
Recipient ID: Q92644
Recipient ID: Q92649
Recipient ID: Q92670
Recipient ID: Q92739
Recipient ID: Q92742
Recipient ID: Q92743
Recipient ID: Q92744
Recipient ID: Q92745
Recipient ID: Q92746
Recipient ID: Q92758
Recipient ID: Q92766
Recipient ID: Q92781
Recipient ID: Q927

### Task 2

In [2]:
def get_wikipedia_content(wikidata_id):
    wikidata_api_url = "https://www.wikidata.org/w/api.php" # base url
    params = { # parameters
        'action': 'wbgetentities',
        'format': 'json', # returns json format
        'ids': wikidata_id, #input
        'props': 'sitelinks' #
    }
    response = requests.get(url=wikidata_api_url, params=params) # fetch request
    wikidata_data = response.json() # convert to json
    sitelinks = wikidata_data['entities'][wikidata_id]['sitelinks']

    if 'enwiki' not in sitelinks: # make sure it's english wikipedia
        return None

    wikipedia_title = sitelinks['enwiki']['title'] # use the wikipedia api to get wikipedia page content
    wikipedia_api_url = "https://en.wikipedia.org/w/api.php"
    params = { # parameters
        'action': 'query',
        'format': 'json', # returns json format
        'titles': wikipedia_title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    response = requests.get(url=wikipedia_api_url, params=params) # call
    wikipedia_data = response.json()
    pages = wikipedia_data['query']['pages']
    page_id = next(iter(pages)) # get page content
    content = pages[page_id]['revisions'][0]['*'] # revision is a version of an article (with [0][*] being all the content of the most recent)
    return content # returns page

### Task 3:

In [3]:
# My solution returns the intro, but it's a very short one. My attempt tried to use BeautifulSoup to parse the information, however I could not find the
# HTML to get the intro from the returned Wikipedia article. The intro length impacts how well the NLP later on works. Regex is instead used to get the intros.

# This task can take a while to run.

import requests
from bs4 import BeautifulSoup
import re

def get_wikipedia_intro(wikipedia_content):
    #soup = BeautifulSoup(wikipedia_content, 'html.parser') #parse the data with beautifulsoup
    short_description_match = re.search(r'\{\{short description\|(.+?)\}\}', wikipedia_content) # find the content within double curly braces and containing "short description". regex
    if short_description_match: # extract the short description if the pattern is found
        short_description = short_description_match.group(1) # assign matched text 
    else:
        short_description = None
        
    return short_description

def get_wikidata_info(recipient_id): # function takes in wikidata entity value (of each award winner)
    wikidata_api_url = "https://www.wikidata.org/w/api.php" # base url
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'ids': recipient_id,
        'props': 'claims|labels' # claims are information about a person, labels are metadata info (to get english content)
    }

    response = requests.get(url=wikidata_api_url, params=params) # call wikidata api
    entity_info = response.json().get('entities', {}).get(recipient_id, {}) # parses response and puts it into a dictionary (with entities as a key).
    return entity_info # return

def get_entity_name(entity_id): # function that gets the name of the wikidata entity from english wikipedia
    entity_info = get_wikidata_info(entity_id)
    return entity_info.get('labels', {}).get('en', {}).get('value', None)

def get_claim_value(claims, property_id): # again, a claim is information about a person. this function takes in a claims for a person, and IDs representing the specific info wanted
    claim_list = claims.get(property_id, [])
    if claim_list:
        mainsnak = claim_list[0].get('mainsnak', {}) # mainsnak contains the data value
        datavalue = mainsnak.get('datavalue', {}) # part of the mainsnak that holds the valye
        value = datavalue.get('value') # the value itself
        if value and 'id' in value: # if the value has an id, get that, otherwise return the value itself
            return get_entity_name(value['id'])
        return value
    return None

award_winners = [] # declare empty list of award winners

for recipient_id in turing_award_recipients: #iterate through list of wikidata entities of turing award winners
    wikipedia_content = get_wikipedia_content(recipient_id) # get wikipedia page
    if wikipedia_content:
        wikipedia_intro = get_wikipedia_intro(wikipedia_content) # get introduction from wikipedia
        recipient_info = get_wikidata_info(recipient_id) # get wikidata info
        award_winner = { # create dictionary for award winner
            'name': get_entity_name(recipient_id), # call function to get recipient name
            'intro': wikipedia_intro,
            'gender': get_claim_value(recipient_info.get('claims', {}), 'P21'), # call function to get info about person. each p value corresponds to what's needed
            'birth_date': get_claim_value(recipient_info.get('claims', {}), 'P569'), # birth date
            'birth_place': get_claim_value(recipient_info.get('claims', {}), 'P19'), # birth place
            'employer': get_claim_value(recipient_info.get('claims', {}), 'P108'), # employer
            'educated_at': get_claim_value(recipient_info.get('claims', {}), 'P69') # education/uni
        }
        award_winners.append(award_winner) #add the award winner's information to the list

for winner_info in award_winners: # print award winners
    print("Award Winner Information:")
    print(winner_info)
    print("\n")

Award Winner Information:
{'name': 'Tim Berners-Lee', 'intro': 'English computer scientist, inventor of the World Wide Web (born 1955)', 'gender': 'male', 'birth_date': {'time': '+1955-06-08T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 11, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'birth_place': 'London', 'employer': 'World Wide Web Consortium', 'educated_at': "The Queen's College"}


Award Winner Information:
{'name': 'Edsger W. Dijkstra', 'intro': 'Dutch computer scientist (1930–2002)', 'gender': 'male', 'birth_date': {'time': '+1930-05-11T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 11, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'birth_place': 'Rotterdam', 'employer': 'University of Texas at Austin', 'educated_at': 'Leiden University'}


Award Winner Information:
{'name': 'Frances E. Allen', 'intro': None, 'gender': 'female', 'birth_date': {'time': '+1932-08-04T00:00:00Z', 'timezone': 0, 'before': 0, 'after

### Task 4:

In [4]:
names = [winner['name'] for winner in award_winners]# extract names from award_winners
sorted_names = sorted(names) # sort names in alphabetical order
for name in sorted_names: # print
    print(name)

Adi Shamir
Alan Kay
Alan Perlis
Alfred Aho
Allen Newell
Amir Pnueli
Andrew Yao
Association for Computing Machinery
Barbara Liskov
Bob Kahn
Butler Lampson
Category:Turing Award laureates
Charles Bachman
Charles P. Thacker
Dana Scott
David A. Patterson
Dennis M. Ritchie
Donald Knuth
Douglas Engelbart
E. Allen Emerson
Edgar F. Codd
Edmund M. Clarke
Edsger W. Dijkstra
Edward Feigenbaum
Edwin Catmull
Fernando J. Corbató
Frances E. Allen
Fred Brooks
Geoffrey Hinton
Herbert Simon
Iosif Sifakis
Ivan Sutherland
Jack Dongarra
James H. Wilkinson
Jeffrey David Ullman
Jim Gray
John Backus
John Cocke
John Edward Hopcroft
John L. Hennessy
John McCarthy
Judea Pearl
Juris Hartmanis
Ken Thompson
Kenneth E. Iverson
Kristen Nygaard
Leonard Adleman
Leslie Lamport
Leslie Valiant
Manuel Blum
Martin Edward Hellman
Marvin Minsky
Maurice Wilkes
Michael O. Rabin
Michael Stonebraker
Niklaus Wirth
Ole-Johan Dahl
Pat Hanrahan
Peter Naur
Raj Reddy
Richard E. Stearns
Richard Hamming
Richard M. Karp
Robert Metcalfe
Ro

### Task 5:

In [5]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter

stopwords = set(stopwords.words('english')) # stop words

def count_words(text): # function to count the number of words in text
    return len(word_tokenize(str(text))) # word tokenize function from nltk

def count_sentences(text): # function to count sentences in a paragraph
    return len(sent_tokenize(str(text))) # sentance tokenize function from nltk

def count_paragraphs(text): # function to count number of paragraphs in a text
    return len(str(text).split('\n\n')) # split on new line

def get_top_words(text): # function to get top words
    words = word_tokenize(str(text)) #tokenize
    word_counts = Counter(words) # Counter iterates over words list 
    top_words = word_counts.most_common(10) # gets most comon words

    return top_words

def preprocess_text(text): # function to preprocess text
    words = word_tokenize(str(text)) # tokenize
    filtered_words = [ # remove stopwords and punctuation
        word.lower()
        for word in words
        if word.lower() not in stopwords and word.lower() not in string.punctuation # if not stopword or punctuation, include in filtered words list
    ]
    return filtered_words #return list

df = pd.DataFrame(award_winners) # convert to dataframe
df['count_words'] = df['intro'].apply(count_words) # add count_words column
df['count_sentences'] = df['intro'].apply(count_sentences) # add count_sentences column
df['count_paragraphs'] = df['intro'].apply(count_paragraphs) # add count_paragraph column
df['common_words'] = df['intro'].apply(get_top_words) # add common_words common (not preprocessed)
df['common_words_after_preprocessing'] = df['intro'].apply(preprocess_text) # add pre processed common words
display(df.head(10)) # display top 10


Unnamed: 0,name,intro,gender,birth_date,birth_place,employer,educated_at,count_words,count_sentences,count_paragraphs,common_words,common_words_after_preprocessing
0,Tim Berners-Lee,"English computer scientist, inventor of the Wo...",male,"{'time': '+1955-06-08T00:00:00Z', 'timezone': ...",London,World Wide Web Consortium,The Queen's College,14,1,1,"[(English, 1), (computer, 1), (scientist, 1), ...","[english, computer, scientist, inventor, world..."
1,Edsger W. Dijkstra,Dutch computer scientist (1930–2002),male,"{'time': '+1930-05-11T00:00:00Z', 'timezone': ...",Rotterdam,University of Texas at Austin,Leiden University,6,1,1,"[(Dutch, 1), (computer, 1), (scientist, 1), ((...","[dutch, computer, scientist, 1930–2002]"
2,Frances E. Allen,,female,"{'time': '+1932-08-04T00:00:00Z', 'timezone': ...",Peru,IBM,State University of New York at Albany,1,1,1,"[(None, 1)]",[none]
3,Shafrira Goldwasser,Israeli American computer scientist,female,"{'time': '+1958-11-14T00:00:00Z', 'timezone': ...",New York City,Massachusetts Institute of Technology,Carnegie Mellon University,4,1,1,"[(Israeli, 1), (American, 1), (computer, 1), (...","[israeli, american, computer, scientist]"
4,Donald Knuth,American computer scientist and mathematician ...,male,"{'time': '+1938-01-10T00:00:00Z', 'timezone': ...",Milwaukee,Stanford University,Case Western Reserve University,9,1,1,"[(American, 1), (computer, 1), (scientist, 1),...","[american, computer, scientist, mathematician,..."
5,Dennis M. Ritchie,,male,"{'time': '+1941-09-09T00:00:00Z', 'timezone': ...",Bronxville,Bell Labs,Harvard University,1,1,1,"[(None, 1)]",[none]
6,Dana Scott,American logician (born 1932),male,"{'time': '+1932-10-11T00:00:00Z', 'timezone': ...",Berkeley,"University of California, Berkeley",Princeton University,6,1,1,"[(American, 1), (logician, 1), ((, 1), (born, ...","[american, logician, born, 1932]"
7,Bob Kahn,,male,"{'time': '+1938-12-23T00:00:00Z', 'timezone': ...",New York City,Massachusetts Institute of Technology,City College of New York,1,1,1,"[(None, 1)]",[none]
8,Maurice Wilkes,British computer scientist (1913–2010),male,"{'time': '+1913-06-26T00:00:00Z', 'timezone': ...",Dudley,Digital Equipment Corporation,St John's College,6,1,1,"[(British, 1), (computer, 1), (scientist, 1), ...","[british, computer, scientist, 1913–2010]"
9,Alan Perlis,,male,"{'time': '+1922-04-01T00:00:00Z', 'timezone': ...",Pittsburgh,Yale University,Carnegie Mellon University,1,1,1,"[(None, 1)]",[none]


## 3.2 Sub-activity: Applying NLP operations on the corpus

### Task 6

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

intro_words = df['intro'].tolist()  # create list of all intros
stopwords_set = set(stopwords.words('english'))  # stopwords
porter_stemmer = PorterStemmer()  # create PorterStemmer instance

def preprocess_intro(text):  # function to preprocess text
    words = word_tokenize(str(text))
    filtered_words = []
    for word in words: #iterate through each word
        lowercase_word = word.lower() # make lower
        if lowercase_word not in stopwords_set and lowercase_word not in string.punctuation: # if not in stop words or punctuation, add to filtered words list
            filtered_words.append(lowercase_word)
    return filtered_words # preprocessing (part (a))

preprocessed_intro_words = []
for intro in intro_words:  # apply pre processing to each word in intro
    processed_intro = preprocess_intro(intro)
    if processed_intro != ['none']: # get rid of 'none' values (issue with how long my intros are)
        preprocessed_intro_words.append(processed_intro)

all_words = []
for sublist in preprocessed_intro_words: # loop to get every word in dict in to one list
    if isinstance(sublist, list): # if list within list
        for word in sublist:
            all_words.append(word) # append to list

unique_words_count = len(set(all_words)) # get each unique word (set)
print(f"Number of unique words before stemming: {unique_words_count}") # part (b)

stemmed_intro_words = [] # declare list for stemmed words
for word in all_words: # iterate through each word
    stemmed_word = porter_stemmer.stem(word) # apply porter stemmer
    stemmed_intro_words.append(stemmed_word) # append to list
    
unique_stemmed_words_count = len(set(stemmed_intro_words)) # get each unique word (set)
print(f"Number of unique words after stemming: {unique_stemmed_words_count}") # part (c)
# Again the results being the same may be an issue with the intro I'm getting from the wikipedia API.

Number of unique words before stemming: 47
Number of unique words after stemming: 47


### Task 7

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

intro_words = df['intro'].tolist()  # create list of all intros
stopwords_set = set(stopwords.words('english'))  # stopwords

def preprocess_intro(text):  # function to preprocess text
    words = word_tokenize(str(text))
    filtered_words = []
    for word in words: #iterate through each word
        lowercase_word = word.lower() # make lower
        if lowercase_word not in stopwords_set and lowercase_word not in string.punctuation: # if not in stop words or punctuation, add to filtered words list
            filtered_words.append(lowercase_word)
    return filtered_words # part (a)

preprocessed_intro_words = []
for intro in intro_words:  # apply pre processing to each word in intro
    processed_intro = preprocess_intro(intro)
    if processed_intro != ['none']: # get rid of 'none' values (issue with how long my intros are)
        preprocessed_intro_words.append(processed_intro)

all_words = []
for sublist in preprocessed_intro_words: # loop to get every word in dict in to one list
    if isinstance(sublist, list): # if list within list
        for word in sublist:
            all_words.append(word) # append to list

unique_words_count = len(set(all_words)) # get unique words again
print(f"Number of unique words before stemming: {unique_words_count}") # part (b)

snowball_stemmer = SnowballStemmer("english") # get snowball stemmer
stemmed_intro_words_snowball = [] # create list of stemmed intro words
for word in all_words:
    stemmed_word = snowball_stemmer.stem(word) # stem
    stemmed_intro_words_snowball.append(stemmed_word) # add to list

unique_stemmed_words_count_snowball = len(set(stemmed_intro_words_snowball)) # get unique words
print(f"Number of unique words after stemming with SnowballStemmer: {unique_stemmed_words_count_snowball}") #part (c)


Number of unique words before stemming: 47
Number of unique words after stemming with SnowballStemmer: 47


### Task 8

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet') # uncomment if wordnet needs downloading
import string

intro_words = df['intro'].tolist()  # create list of all intros
stopwords_set = set(stopwords.words('english'))  # stopwords
lemmatizer = WordNetLemmatizer()  # create WordNetLemmatizer instance

def preprocess_intro(text):  # function to preprocess text
    words = word_tokenize(str(text))
    filtered_words = []
    for word in words: #iterate through each word
        lowercase_word = word.lower() # make lower
        if lowercase_word not in stopwords_set and lowercase_word not in string.punctuation: # if not in stop words or punctuation, add to filtered words list
            filtered_words.append(lowercase_word)
    return filtered_words # part (a)

preprocessed_intro_words = []
for intro in intro_words:  # apply pre processing to each word in intro
    processed_intro = preprocess_intro(intro)
    if processed_intro != ['none']: # get rid of 'none' values (issue with how long my intros are)
        preprocessed_intro_words.append(processed_intro)

all_words = []
for sublist in preprocessed_intro_words: # loop to get every word in dict in to one list
    if isinstance(sublist, list): # if list within list
        for word in sublist:
            all_words.append(word) # append to list

unique_words_count = len(set(all_words))
print(f"Number of unique words before lemmatization: {unique_words_count}") # part (b)

lemmatized_intro_words = []
for word in all_words: # iterate through each word
    lemmatized_word = lemmatizer.lemmatize(word) # use lemmatizer
    lemmatized_intro_words.append(lemmatized_word)
unique_lemmatized_words_count = len(set(lemmatized_intro_words)) # get unique words (set)
print(f"Number of unique words after lemmatization: {unique_lemmatized_words_count}") # part (c)


Number of unique words before lemmatization: 47
Number of unique words after lemmatization: 47


In [None]:
# This was as far as I managed to get for this activity.