In [3]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install wikipedia
# !pip install pandas
# !pip install wptools

In [4]:
import os
import requests
from bs4 import BeautifulSoup
import wikipedia
import pandas as pd
import wptools

def fetch_nobel_laureates(url, category='not set'):
  response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
  soup = BeautifulSoup(response.content, 'html.parser')

  # Find the table containing the data
  tables = soup.find_all('table', {'class': 'wikitable'})

  nobel_laureates_df = pd.read_html(str(tables))
  nobel_laureates_df = nobel_laureates_df[0]
  nobel_laureates_df.columns = nobel_laureates_df.columns.str.lower() # Lowercase column names
  nobel_laureates_df = nobel_laureates_df.drop(columns=['image', 'rationale[c]', 'ref'])
  nobel_laureates_df = nobel_laureates_df.rename(columns={'laureate[a]': 'name', 'country[b]': 'country'})
  nobel_laureates_df['name'] = nobel_laureates_df['name'].str.replace(r'\s*\([^()]*\)', '', regex=True)
  # Drop duplicate name rows
  nobel_laureates_df = nobel_laureates_df.drop_duplicates(subset=['name'], keep='first')
  # Drop rows with missing values in the name column (e.g. empty rows)
  nobel_laureates_df = nobel_laureates_df.dropna(subset=['name'])
  # Drop rows with text containing 'Not awarded' in the name column
  nobel_laureates_df = nobel_laureates_df[~nobel_laureates_df['name'].str.contains('Not awarded', case=False)]
  nobel_laureates_df = nobel_laureates_df[~nobel_laureates_df['name'].str.contains('Skłodowska', case=False)]

  # Add a column for the category
  nobel_laureates_df['category'] = category
  return nobel_laureates_df.head(100)


In [5]:
# Create function to fetch biography of laureates from Wikipedia
def fetch_biography(name):
  search = wikipedia.search(name)[0]
  try:
    page = wikipedia.page(search)
    full_content = page.content # we are getting all the content of the page for now
  except:
    # suggestions
    wp_page = wptools.page(name)
    query = wp_page.get_query()
    page_id = query.data["pageid"]
    page = wikipedia.page(pageid=page_id)
    full_content = page.content
  return full_content

In [6]:
def fetch_rdf_triples(name):
    # Replace spaces with underscores
    name = name.replace(" ", "_")
    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
        ?subject dbo:wikiPageWikiLink dbr:{name} .
        ?subject ?predicate ?object .
    }}
    LIMIT 100
    """




    url = "https://dbpedia.org/sparql"
    headers = {
        "Accept": "application/sparql-results+json"
    }
    params = {
        "query": query,
        "format": "json"
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()['results']['bindings']
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

In [7]:
# if the data directory does not exist, create it
if not os.path.exists('physics_nobel_laureate'):
    os.makedirs('physics_nobel_laureate')

if not os.path.exists('chemistry_nobel_laureate'):
    os.makedirs('chemistry_nobel_laureate')

physics_nobel_laureate = fetch_nobel_laureates('https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics', 'Physics')

# Apply the fetch_biography function to the name column of the DataFrame
physics_nobel_laureate['biography'] = physics_nobel_laureate['name'].apply(fetch_biography)

# Save each laureate's data to a /data/name_physics_nobel_laureate.txt file
for index, row in physics_nobel_laureate.iterrows():
    name = row['name']
    row_string = '\n'.join([f'{key}: {value}' for key, value in row.items()])
    # Save the data to a file
    with open(f'physics_nobel_laureate/{name.replace(" ", "")}_.txt', 'w') as file:
        file.write(row_string)
    rdf_triples = fetch_rdf_triples(name) # Fetch RDF triples
    # Save the RDF triples to a json file
    with open(f'physics_nobel_laureate/{name.replace(" ", "")}.json', 'w') as file:
        file.write(str(rdf_triples))

print(len(physics_nobel_laureate))
print(physics_nobel_laureate.head(10))

  nobel_laureates_df = pd.read_html(str(tables))
en.wikipedia.org (query) Marie Curie
en.wikipedia.org (query) Marie Curie (&plcontinue=20408|0|Marie_M...
en.wikipedia.org (imageinfo) File:Marie Curie c. 1920s.jpg
Marie Curie (en) data
{
  aliases: <list(4)> Maria Salomea Skłodowska, Maria Skłodowska-Cu...
  assessments: <dict(10)> Medicine, Military history, Biography, P...
  description: Polish and French physicist and chemist (1867–1934)
  extext: <str(2775)> **Maria Salomea Skłodowska-Curie** (Polish: ...
  extract: <str(3416)> <p class="mw-empty-elt"></p><p><b>Maria Sal...
  image: <list(4)> {'kind': 'query-pageimage', 'file': 'File:Marie...
  label: Marie Curie
  length: 109,315
  links: <list(828)> 1903 Nobel Memorial Prize in Economic Science...
  modified: <dict(1)> page
  pageid: 20408
  random: Fómeque Formation
  redirects: <list(34)> {'pageid': 20409, 'ns': 0, 'title': 'Marie...
  requests: <list(3)> query, query, imageinfo
  title: Marie Curie
  url: https://en.wikipedia.

100
   year                 name                           country category  \
0  1901      Wilhelm Röntgen                     German Empire  Physics   
1  1902      Hendrik Lorentz                       Netherlands  Physics   
2  1902        Pieter Zeeman                       Netherlands  Physics   
3  1903      Henri Becquerel                            France  Physics   
4  1903         Pierre Curie                            France  Physics   
5  1903          Marie Curie  Poland ( Russian Empire)  France  Physics   
6  1904        Lord Rayleigh                    United Kingdom  Physics   
7  1905       Philipp Lenard                     German Empire  Physics   
8  1906        J. J. Thomson                    United Kingdom  Physics   
9  1907  Albert A. Michelson                     United States  Physics   

                                           biography  
0  Wilhelm Conrad Röntgen (; German pronunciation...  
1  Hendrik Antoon Lorentz (; 18 July 1853 – 4 Feb...  
2  Pi

In [8]:
chemistry_nobel_laureate = fetch_nobel_laureates('https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Chemistry', 'Chemistry')

# Apply the fetch_biography function to the name column of the DataFrame
chemistry_nobel_laureate['biography'] = chemistry_nobel_laureate['name'].apply(fetch_biography)

# Save each laureate's data to a /data/name_chemistry_nobel_laureate.txt file
for index, row in chemistry_nobel_laureate.iterrows():
    name = row['name']



    row_string = '\n'.join([f'{key}: {value}' for key, value in row.items()])
    # Save the data to a file
    with open(f'chemistry_nobel_laureate/{name.replace(" ", "")}.txt', 'w') as file:
        file.write(row_string)
    rdf_triples = fetch_rdf_triples(name) # Fetch RDF triples
    # Save the RDF triples to a json file
    with open(f'chemistry_nobel_laureate/{name.replace(" ", "")}.json', 'w') as file:
        file.write(str(rdf_triples))

print(len(chemistry_nobel_laureate))
print(chemistry_nobel_laureate.head(20))

  nobel_laureates_df = pd.read_html(str(tables))
en.wikipedia.org (query) Sir William Ramsay
en.wikipedia.org (imageinfo) File:William Ramsay.jpg
William Ramsay (en) data
{
  aliases: <list(1)> Sir William Ramsay
  assessments: <dict(3)> Biography, Scotland, Chemistry
  description: Scottish chemist
  extext: <str(600)> **Sir William Ramsay** (; 2 October 1852 – 23...
  extract: <str(660)> <p class="mw-empty-elt"></p><p><b>Sir Willia...
  image: <list(2)> {'kind': 'query-pageimage', 'file': 'File:Willi...
  label: William Ramsay
  length: 17,024
  links: <list(299)> 1902 Coronation Honours, 1904 Nobel Memorial ...
  modified: <dict(1)> page
  pageid: 48187
  random: Urazbakhty
  redirected: <list(1)> {'from': 'Sir William Ramsay', 'to': 'Will...
  redirects: <list(3)> {'pageid': 98014, 'ns': 0, 'title': 'Sir Wi...
  requests: <list(2)> query, imageinfo
  title: William Ramsay
  url: https://en.wikipedia.org/wiki/William_Ramsay
  url_raw: https://en.wikipedia.org/wiki/William_Ramsay?act

100
    year                          name                         country  \
0   1901  Jacobus Henricus van 't Hoff                     Netherlands   
1   1902          Hermann Emil Fischer                         Germany   
2   1903       Svante August Arrhenius                          Sweden   
3   1904            Sir William Ramsay                  United Kingdom   
4   1905              Adolf von Baeyer                         Germany   
5   1906                 Henri Moissan                          France   
6   1907                Eduard Buchner                         Germany   
7   1908             Ernest Rutherford     United Kingdom  New Zealand   
8   1909               Wilhelm Ostwald                         Germany   
9   1910                  Otto Wallach                         Germany   
11  1912               Victor Grignard                          France   
12  1912                 Paul Sabatier                          France   
13  1913                 Alfred We