In [1]:
import io
import json
import os

import pandas as pd
import requests
import spacy
import wikipedia
import wptools
from bs4 import BeautifulSoup

# Function to fetch the Nobel laureates from the Wikipedia page and do some cleaning
def fetch_nobel_laureates(url, category='not set', max_rows=100):
  """Scrape the first 'wikitable' on a Wikipedia laureate list into a cleaned DataFrame.

  Args:
    url: Address of the Wikipedia list page to download.
    category: Value stored in the added 'category' column for every row.
    max_rows: Maximum number of rows to return (default 100). Pass None to
      return every row that survives the cleaning steps.

  Returns:
    A DataFrame with lower-cased column names, the laureate column renamed to
    'name', the 'image', 'rationale' and 'ref' columns removed and a
    'category' column added; or None when the page cannot be downloaded or
    contains no parsable table.

  Raises:
    KeyError: If the parsed table has no laureate column.
  """
  try:
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()
  except requests.exceptions.RequestException as err:
    # RequestException also covers connection errors and timeouts, not only
    # HTTP error statuses.
    print(err)
    return None
  soup = BeautifulSoup(response.content, features='lxml')

  # Find the table containing the data
  tables = soup.find_all('table', {'class': 'wikitable'})
  if not tables:
    print(f'No wikitable found at {url}')
    return None

  # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
  try:
    nobel_laureates_df = pd.read_html(io.StringIO(str(tables)))[0]
  except ValueError as err:
    # read_html raises ValueError when it cannot find any table to parse.
    print(err)
    return None

  # Lowercase the column names and strip footnote markers such as "[a]" so the
  # cleaning below does not depend on which letter Wikipedia assigns to a note.
  nobel_laureates_df.columns = (
      nobel_laureates_df.columns.str.lower()
      .str.replace(r'\[[^\]]*\]', '', regex=True)
      .str.strip()
  )
  nobel_laureates_df = nobel_laureates_df.drop(columns=['image', 'rationale', 'ref'], errors='ignore')
  nobel_laureates_df = nobel_laureates_df.rename(columns={'laureate': 'name'})
  if 'name' not in nobel_laureates_df.columns:
    raise KeyError(f"No laureate column found in the table at {url}")

  # Remove parenthesised annotations from the names
  nobel_laureates_df['name'] = nobel_laureates_df['name'].str.replace(r'\s*\([^()]*\)', '', regex=True)
  # Drop duplicate name rows
  nobel_laureates_df = nobel_laureates_df.drop_duplicates(subset=['name'], keep='first')
  # Drop rows with missing values in the name column (e.g. empty rows)
  nobel_laureates_df = nobel_laureates_df.dropna(subset=['name'])

  # Excluding hard to parse rows
  hard_to_parse = ('Not awarded', 'Skłodowska', ', 7th', "'ichirō", "van 't Hoff")
  for fragment in hard_to_parse:
    nobel_laureates_df = nobel_laureates_df[~nobel_laureates_df['name'].str.contains(fragment, case=False)]

  # Add a column for the category; assign() builds a new frame instead of
  # writing into a filtered selection.
  nobel_laureates_df = nobel_laureates_df.assign(category=category)
  if max_rows is not None:
    nobel_laureates_df = nobel_laureates_df.head(max_rows)
  # Hand back an independent copy so callers can add columns to it freely.
  return nobel_laureates_df.copy()


In [2]:
# function to fetch biography of laureates from Wikipedia
def fetch_biography(name):
  """Return the full text of the Wikipedia article that best matches `name`.

  The first hit of a Wikipedia search is used as the page title. If that title
  is ambiguous, the first disambiguation option is loaded instead. If the page
  lookup fails for any other reason, the page id is resolved through wptools
  and the page is fetched by id.

  Args:
    name: Name to look up.

  Returns:
    The page content as a string, or None when every lookup strategy fails.
  """
  hits = wikipedia.search(name)
  # An empty search result used to raise IndexError; fall back to the raw name.
  search = hits[0] if hits else name
  full_content = None
  try:
    page = wikipedia.page(search)
    full_content = page.content # we are getting all the content of the page for now
  except wikipedia.DisambiguationError as e:
    # Handle the disambiguation error
    print("DisambiguationError: The query resulted in multiple pages, choosing one.")
    try:
      specific_page = wikipedia.page(e.options[0])
      print("Picked:", specific_page.title)
      # Previously the picked page was loaded but its text was discarded, so
      # this branch always returned None.
      full_content = specific_page.content
    except Exception as err:
      # The first option can itself be ambiguous or missing; skip this name
      # instead of aborting the whole batch.
      print(f"Could not resolve a disambiguation option for {name!r}: {err}")
  except Exception:
    # Fall back to resolving the page id through wptools
    try:
      wp_page = wptools.page(name)
      query = wp_page.get_query()
      page_id = query.data["pageid"]
      page = wikipedia.page(pageid=page_id)
      full_content = page.content
    except Exception as err:
      print(f"Could not fetch a biography for {name!r}: {err}")
  return full_content

In [3]:
# Function to fetch the RDF triples from DBpedia for each Nobel laureate where the name is the subject or object
def fetch_rdf_triples(name):
    """Fetch up to 100 DBpedia triples in which the named resource is subject or object.

    Spaces in `name` are replaced with underscores and the result is used as
    the local part of a ``http://dbpedia.org/resource/`` IRI.

    Args:
        name: Human-readable resource name, e.g. "Marie Curie".

    Returns:
        On success, the list found under ``results.bindings`` in the SPARQL
        JSON response. On any request or decoding failure, a dict of the form
        ``{"error": <message>}``.
    """
    # Replace spaces with underscores
    resource = name.replace(" ", "_")
    # Percent-encode the characters that may not appear inside a SPARQL IRI
    # reference, so a scraped name can neither break nor alter the query.
    forbidden = '<>"{}|^`\\'
    resource = "".join(
        f"%{ord(ch):02X}" if ch in forbidden or ord(ch) <= 0x20 else ch
        for ch in resource
    )
    # A full IRI is used instead of the `dbr:` prefixed form because prefixed
    # names cannot contain unescaped characters such as apostrophes, commas or
    # parentheses.
    iri = f"<http://dbpedia.org/resource/{resource}>"

    # The earlier query never matched triples with the resource as subject and
    # its second branch matched second-hop resources. Each branch now pins the
    # resource to one position and binds it so every row has all three columns.
    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
      {{
        {iri} ?predicate ?object .
        BIND({iri} AS ?subject)
      }}
      UNION
      {{
        ?subject ?predicate {iri} .
        BIND({iri} AS ?object)
      }}
    }}
    LIMIT 100
    """

    url = "https://dbpedia.org/sparql"
    headers = {
        "Accept": "application/sparql-results+json"
    }
    params = {
        "query": query,
        "format": "json"
    }

    try:
        # A timeout keeps one stalled request from blocking the whole loop.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()['results']['bindings']
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
    except (ValueError, KeyError) as e:
        # Body was not JSON, or did not have the expected results/bindings shape.
        return {"error": f"Unexpected response from DBpedia: {e}"}


In [4]:
# Output root: "Data Directory", two levels above the current working directory.
data_dir = os.getcwd()
data_dir = os.path.join(data_dir, "..", "..", "Data Directory")

# Create the per-category output directories if they do not exist yet.
# exist_ok=True replaces the separate exists() check and its race window.
for subdir in ('physics_nobel_laureate', 'chemistry_nobel_laureate'):
    os.makedirs(os.path.join(data_dir, subdir), exist_ok=True)

# Fetch the Nobel laureates in Physics
physics_nobel_laureate = fetch_nobel_laureates('https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics', 'Physics')
if physics_nobel_laureate is None:
    # fetch_nobel_laureates returns None on failure; stop with a clear message
    # instead of an opaque TypeError on the next line.
    raise RuntimeError('Could not fetch the list of Nobel laureates in Physics')

# Apply the fetch_biography function to the name column of the DataFrame
physics_nobel_laureate['biography'] = physics_nobel_laureate['name'].apply(fetch_biography)

# Save each laureate's data to a /physics_nobel_laureate directory
physics_dir = os.path.join(data_dir, 'physics_nobel_laureate')
for index, row in physics_nobel_laureate.iterrows():
    name = row['name']
    # Both files of a laureate share one stem: the name with spaces doubled into "__".
    file_stem = name.replace(" ", "__")
    row_string = '\n'.join([f'{key}: {value}' for key, value in row.items()])
    # Save the metadata to a file
    with open(os.path.join(physics_dir, f'{file_stem}.txt'), 'w', encoding='utf-8') as f:
        f.write(row_string)
    rdf_triples = fetch_rdf_triples(name) # Fetch RDF triples
    # Save the RDF triples to a json file
    with open(os.path.join(physics_dir, f'{file_stem}.json'), 'w', encoding='utf-8') as f:
        json.dump(rdf_triples, f, indent=4)

  nobel_laureates_df = pd.read_html(str(tables))
en.wikipedia.org (query) Marie Curie
en.wikipedia.org (query) Marie Curie (&plcontinue=20408|0|Marie_M...
en.wikipedia.org (imageinfo) File:Marie Curie c. 1920s.jpg
Marie Curie (en) data
{
  aliases: <list(4)> Maria Salomea Skłodowska, Maria Skłodowska-Cu...
  assessments: <dict(10)> Medicine, Military history, Biography, P...
  description: Polish and French physicist and chemist (1867–1934)
  extext: <str(2778)> **Maria Salomea Skłodowska-Curie** (Polish: ...
  extract: <str(3406)> <p class="mw-empty-elt"></p><p><b>Maria Sal...
  image: <list(4)> {'kind': 'query-pageimage', 'file': 'File:Marie...
  label: Marie Curie
  length: 109,351
  links: <list(829)> 1903 Nobel Memorial Prize in Economic Science...
  modified: <dict(1)> page
  pageid: 20408
  random: A Woman Like You (Johnny Reid song)
  redirects: <list(34)> {'pageid': 20409, 'ns': 0, 'title': 'Marie...
  requests: <list(3)> query, query, imageinfo
  title: Marie Curie
  url: htt

In [5]:
# Fetch the Nobel laureates in Chemistry
chemistry_nobel_laureate = fetch_nobel_laureates('https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Chemistry', 'Chemistry')
if chemistry_nobel_laureate is None:
    # fetch_nobel_laureates returns None on failure; stop with a clear message
    # instead of an opaque TypeError on the next line.
    raise RuntimeError('Could not fetch the list of Nobel laureates in Chemistry')

# Apply the fetch_biography function to the name column of the DataFrame
chemistry_nobel_laureate['biography'] = chemistry_nobel_laureate['name'].apply(fetch_biography)

# Save each laureate's data to a /chemistry_nobel_laureate directory
chemistry_dir = os.path.join(data_dir, 'chemistry_nobel_laureate')
# Make sure the target directory exists before any file is written into it.
os.makedirs(chemistry_dir, exist_ok=True)
for index, row in chemistry_nobel_laureate.iterrows():
    name = row['name']
    # Both files of a laureate share one stem: the name with spaces doubled into "__".
    file_stem = name.replace(" ", "__")
    row_string = '\n'.join([f'{key}: {value}' for key, value in row.items()])
    # Save the metadata to a file
    with open(os.path.join(chemistry_dir, f'{file_stem}.txt'), 'w', encoding='utf-8') as f:
        f.write(row_string)
    rdf_triples = fetch_rdf_triples(name) # Fetch RDF triples
    # Save the RDF triples to a json file
    with open(os.path.join(chemistry_dir, f'{file_stem}.json'), 'w', encoding='utf-8') as f:
        json.dump(rdf_triples, f, indent=4)

  nobel_laureates_df = pd.read_html(str(tables))
en.wikipedia.org (query) Sir William Ramsay
en.wikipedia.org (imageinfo) File:William Ramsay.jpg
William Ramsay (en) data
{
  aliases: <list(1)> Sir William Ramsay
  assessments: <dict(3)> Biography, Scotland, Chemistry
  description: Scottish chemist
  extext: <str(600)> **Sir William Ramsay** (; 2 October 1852 – 23...
  extract: <str(1811)> <p class="mw-empty-elt"></p><p><b>Sir Willi...
  image: <list(2)> {'kind': 'query-pageimage', 'file': 'File:Willi...
  label: William Ramsay
  length: 17,170
  links: <list(300)> 1902 Coronation Honours, 1904 Nobel Memorial ...
  modified: <dict(1)> page
  pageid: 48187
  random: Hycan A06
  redirected: <list(1)> {'from': 'Sir William Ramsay', 'to': 'Will...
  redirects: <list(3)> {'pageid': 98014, 'ns': 0, 'title': 'Sir Wi...
  requests: <list(2)> query, imageinfo
  title: William Ramsay
  url: https://en.wikipedia.org/wiki/William_Ramsay
  url_raw: https://en.wikipedia.org/wiki/William_Ramsay?acti

DisambiguationError: The query resulted in multiple pages, choosing one.
Picked: Robert Robinson (Australian politician)


en.wikipedia.org (query) Frederick Sanger
en.wikipedia.org (query) Frederick Sanger (&plcontinue=63349|0|Th...
en.wikipedia.org (imageinfo) File:Frederick Sanger2.jpg
Frederick Sanger (en) data
{
  assessments: <dict(3)> Biography, Chemistry, History of Science
  description: British biochemist
  extext: <str(976)> **Frederick Sanger** (; 13 August 1918 – 19 N...
  extract: <str(1059)> <p class="mw-empty-elt"></p><p><b>Frederick...
  image: <list(4)> {'kind': 'query-pageimage', 'file': 'File:Frede...
  label: Frederick Sanger
  length: 51,937
  links: <list(548)> 1-fluoro-2,4-dinitrobenzene, 1958 Nobel Memor...
  modified: <dict(1)> page
  pageid: 63349
  random: Fengyan Township
  redirects: <list(3)> {'pageid': 469137, 'ns': 0, 'title': 'Fred ...
  requests: <list(3)> query, query, imageinfo
  title: Frederick Sanger
  url: https://en.wikipedia.org/wiki/Frederick_Sanger
  url_raw: https://en.wikipedia.org/wiki/Frederick_Sanger?action=raw
  watchers: 124
  wikibase: Q151564
  wikidata

In [6]:
# Stack the physics rows on top of the chemistry rows (row-wise is pd.concat's default axis)
physics_and_chemistry_nobel_laureate = pd.concat(
    [physics_nobel_laureate, chemistry_nobel_laureate]
)

In [7]:
# Load the 'en_core_web_sm' spaCy pipeline once; clean_text_spacy below reuses this object.
# NOTE(review): presumably the model package must already be installed in the kernel's environment — confirm.
nlp = spacy.load('en_core_web_sm')

# Function to clean the text using spaCy
def clean_text_spacy(text):
    """Lemmatize `text` with spaCy, keeping only alphabetic non-stop-word tokens.

    Args:
        text: Raw text to clean. Anything that is not a non-empty string
            (None, NaN, other missing-value markers, '') is treated as
            missing.

    Returns:
        The lemmas of the kept tokens joined by single spaces, or '' when the
        input is missing or empty.
    """
    # `not text` alone lets NaN through (it is truthy), which would then fail
    # inside nlp(); require an actual non-empty string instead.
    if not isinstance(text, str) or not text:
        return ''
    doc = nlp(text)
    # Lemmatize the text and remove the stopwords and non-alphabetic characters
    words = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(words)

# Derive the 'clean_biography' column by running every biography through clean_text_spacy
raw_biographies = physics_and_chemistry_nobel_laureate['biography']
physics_and_chemistry_nobel_laureate['clean_biography'] = raw_biographies.apply(clean_text_spacy)

In [8]:
# Preview the first ten rows of the combined frame as this cell's displayed result.
physics_and_chemistry_nobel_laureate.head(10)

Unnamed: 0,year,name,country,category,biography,clean_biography
0,1901,Wilhelm Röntgen,German Empire,Physics,Wilhelm Conrad Röntgen (; German pronunciation...,Wilhelm Conrad Röntgen german pronunciation ˈv...
1,1902,Hendrik Lorentz,Netherlands,Physics,Hendrik Antoon Lorentz (; 18 July 1853 – 4 Feb...,Hendrik Antoon Lorentz July February dutch phy...
2,1902,Pieter Zeeman,Netherlands,Physics,Pieter Zeeman (Dutch: [ˈzeːmɑn]; 25 May 1865 –...,Pieter Zeeman Dutch ˈzeːmɑn October dutch phys...
3,1903,Henri Becquerel,France,Physics,Antoine Henri Becquerel (; French pronunciatio...,Antoine Henri Becquerel french pronunciation b...
4,1903,Pierre Curie,France,Physics,"Pierre Curie ( KURE-ee, French: [pjɛʁ kyʁi]; 1...",Pierre Curie KURE ee French pjɛʁ kyʁi April fr...
5,1903,Marie Curie,Poland ( Russian Empire) France,Physics,Maria Salomea Skłodowska-Curie (Polish: [ˈmarj...,Maria Salomea Skłodowska Curie Polish ˈmarja s...
6,1904,Lord Rayleigh,United Kingdom,Physics,"John William Strutt, 3rd Baron Rayleigh, (; 1...",John William Strutt Baron Rayleigh November Ju...
7,1905,Philipp Lenard,German Empire,Physics,Philipp Eduard Anton von Lenard (German pronun...,Philipp Eduard Anton von Lenard german pronunc...
8,1906,J. J. Thomson,United Kingdom,Physics,Sir Joseph John Thomson (18 December 1856 – 3...,Sir Joseph John Thomson December August britis...
9,1907,Albert A. Michelson,United States,Physics,Albert Abraham Michelson FFRS FRSE (surname pr...,Albert Abraham Michelson FFRS FRSE surname pro...


In [9]:
# Persist the combined frame as CSV (without the index column) so other scripts can load it
combined_csv_path = os.path.join(data_dir, 'physics_and_chemistry_nobel_laureate.csv')
physics_and_chemistry_nobel_laureate.to_csv(combined_csv_path, index=False)
