In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
!pip install pymediawiki

In [3]:
from bs4 import BeautifulSoup
from urllib import request
from itertools import chain
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import networkx as nx
from mediawiki import MediaWiki

In [20]:
def philosopher_influenced(philosopher_url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?influenced
        WHERE {{ <http://dbpedia.org/resource/{x}> dbo:influenced ?influenced }}
    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    direct_results = [result['influenced']['value'] for result in results['results']['bindings']]

    sparql.setQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?influenced
    WHERE {{ <http://dbpedia.org/resource/{x}> ^dbo:influencedBy ?influenced }}
    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    indirect_results = [result['influenced']['value'] for result in results['results']['bindings']]
    results = list(set(direct_results+indirect_results))
    return [result.split('/')[-1] for result in results]

def philosopher_influenced_by(philosopher_url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?influencedBy
        WHERE {{ <http://dbpedia.org/resource/{x}> dbo:influencedBy ?influencedBy }}
    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    direct_results = [result['influencedBy']['value'] for result in results['results']['bindings']]

    sparql.setQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?influencedBy
    WHERE {{ <http://dbpedia.org/resource/{x}> ^dbo:influenced ?influencedBy }}
    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    indirect_results = [result['influencedBy']['value'] for result in results['results']['bindings']]
    results = list(set(direct_results+indirect_results))
    return [result.split('/')[-1] for result in results]

def get_name(philosopher_url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
                    PREFIX dbo: <http://dbpedia.org/ontology/>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    SELECT ?name
                    WHERE {{ <http://dbpedia.org/resource/{x}> dbp:name ?name}}
                    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    return [result['name']['value'] for result in results['results']['bindings']]

def get_title(url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
                    PREFIX dbo: <http://dbpedia.org/ontology/>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    SELECT ?title
                    WHERE {{ <http://dbpedia.org/resource/{x}> dbp:title ?title}}
                    """.format(x=url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    return [result['title']['value'] for result in results['results']['bindings']]

def get_notable_ideas(philosopher_url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
                    PREFIX dbo: <http://dbpedia.org/ontology/>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    SELECT ?idea
                    WHERE {{ <http://dbpedia.org/resource/{x}> dbo:notableIdea ?idea}}
                    """.format(x=philosopher_url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    ideas =  [result['idea']['value'] for result in results['results']['bindings']]
    return [idea.split('/')[-1] for idea in ideas]

def get_abstract(url, endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
                    PREFIX dbo: <http://dbpedia.org/ontology/>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    SELECT ?abstract
                    WHERE {{ <http://dbpedia.org/resource/{x}> dbo:abstract ?abstract}}
                    """.format(x=url))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results['results']['bindings']

def return_en_result(abstract_list):
    try:
        to_return = [result['abstract']['value'] for result in abstract_list if result['abstract']['xml:lang']=='en'][0]
    except:
        to_return = []
    return to_return

def get_dbpedia_philosophers(endpoint="http://localhost:8890/sparql"):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery("""
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    SELECT ?person
                    WHERE { ?person a dbo:Philosopher}
                    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    dbpedia_philosophers = [result['person']['value'].split('/')[-1] for result in results['results']['bindings']]
    return dbpedia_philosophers

def get_wikipedia_philosophers(list_url, n_letters):
    response = request.urlopen(list_url)
    html = response.read()
    soup  = BeautifulSoup(html, 'html.parser')
    philosopher_urls = [thing.select('a[href*="wiki/"]') for thing in soup.find_all('ul')[1:n_letters]]
    philosopher_urls = chain(*philosopher_urls)
    philosopher_urls = [thing['href'] for thing in philosopher_urls]
    return philosopher_urls
def get_wiki_article(url):
    try:
        return wikipedia.page(url).content
    except:
        return -1

In [87]:
%%time
philosophers_a_c = 'https://en.wikipedia.org/wiki/List_of_philosophers_(A%E2%80%93C)'
philosophers_d_h = 'https://en.wikipedia.org/wiki/List_of_philosophers_(D%E2%80%93H)'
philosophers_i_q = 'https://en.wikipedia.org/wiki/List_of_philosophers_(I%E2%80%93Q)'
philosophers_r_z = 'https://en.wikipedia.org/wiki/List_of_philosophers_(R%E2%80%93Z)'

list_url = [philosophers_a_c, philosophers_d_h, philosophers_i_q, philosophers_r_z]
n_letters = [4, 6, 10, 10]

wikipedia_urls = [get_wikipedia_philosophers(list_url=a, n_letters=b) for a,b in zip(list_url, n_letters)]
wikipedia_urls = [url.split('/wiki/')[-1] for url in chain(*wikipedia_urls)]
dbpedia_urls = get_dbpedia_philosophers()

CPU times: user 999 ms, sys: 13.3 ms, total: 1.01 s
Wall time: 1.83 s


In [88]:
len(wikipedia_urls)

1734

In [89]:
len(dbpedia_urls)

5123

In [103]:
philosopher_urls = list(set(wikipedia_urls) | set(dbpedia_urls))
len(philosopher_urls)

6089

In [160]:
dataset = pd.DataFrame(pd.Series(philosopher_urls), columns=['philosopher_url'])
dataset

Unnamed: 0,philosopher_url
0,Stephen_Law
1,Henry_S._Richardson
2,John_Amos_Comenius
3,Javier_Gomá
4,Oskar_Negt
...,...
6084,Stanisław_Krajewski
6085,Patrick_Stokes_(philosopher)
6086,Ernst_Mach
6087,Jessica_Pierce


In [192]:
%%time
dataset['name'] = dataset['philosopher_url'].apply(get_name)
dataset['abstract'] = dataset['philosopher_url'].apply(get_abstract)
dataset['abstract'] = dataset['abstract'].apply(return_en_result)
dataset['notable_ideas'] = dataset['philosopher_url'].apply(get_notable_ideas)
dataset['influence_inbound'] = dataset['philosopher_url'].apply(philosopher_influenced_by) # Influenced me
dataset['influence_outbound'] = dataset['philosopher_url'].apply(philosopher_influenced) # I influenced
dataset['undirected_influence'] = dataset.apply(lambda x: list(set(x['influence_inbound']+x['influence_outbound'])),axis=1)
dataset = dataset.applymap(lambda x: np.nan if len(x)==0 else x)
dataset

CPU times: user 1min 11s, sys: 11.8 s, total: 1min 23s
Wall time: 3min 18s


Unnamed: 0,philosopher_url,name,abstract,notable_ideas,influence_inbound,influence_outbound,undirected_influence
0,Stephen_Law,[Stephen Law],Stephen Law (born 1960) is an English philoso...,,,,
1,Henry_S._Richardson,[Henry S. Richardson],Henry S. Richardson is an American philosopher...,,[John_Rawls],,[John_Rawls]
2,John_Amos_Comenius,"[John Amos Comenius, Johann Amos Comenius]",John Amos Comenius (Czech: Jan Amos Komenský; ...,,,,
3,Javier_Gomá,[Javier Gomá Lanzón],"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ...",,,,
4,Oskar_Negt,,Oskar Negt (German pronunciation: [ˈneːkt]; bo...,,,,
...,...,...,...,...,...,...,...
6084,Stanisław_Krajewski,,Stanisław Krajewski (born 1950) is a Polish ph...,,,,
6085,Patrick_Stokes_(philosopher),[Patrick Stokes],Patrick Stokes (born 1978) is an Australian ph...,,,,
6086,Ernst_Mach,[Ernst Mach],Ernst Waldfried Josef Wenzel Mach (; German: [...,,"[Gustav_Fechner, George_Berkeley, Andreas_von_...","[Pierre_Duhem, Henri_Poincaré, Ludwig_Boltzman...","[William_James, Friedrich_Hayek, Albert_Einste..."
6087,Jessica_Pierce,[Jessica Pierce],"Jessica Pierce (born October 21, 1965) is an A...",,,,


Unnamed: 0,philosopher_url,name,abstract,notable_ideas,influence_inbound,influence_outbound
0,Stephen_Law,[Stephen Law],Stephen Law (born 1960) is an English philoso...,,,
1,Henry_S._Richardson,[Henry S. Richardson],Henry S. Richardson is an American philosopher...,,[John_Rawls],
2,John_Amos_Comenius,"[John Amos Comenius, Johann Amos Comenius]",John Amos Comenius (Czech: Jan Amos Komenský; ...,,,
3,Javier_Gomá,[Javier Gomá Lanzón],"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ...",,,
4,Oskar_Negt,,Oskar Negt (German pronunciation: [ˈneːkt]; bo...,,,
...,...,...,...,...,...,...
6084,Stanisław_Krajewski,,Stanisław Krajewski (born 1950) is a Polish ph...,,,
6085,Patrick_Stokes_(philosopher),[Patrick Stokes],Patrick Stokes (born 1978) is an Australian ph...,,,
6086,Ernst_Mach,[Ernst Mach],Ernst Waldfried Josef Wenzel Mach (; German: [...,,"[Gustav_Fechner, George_Berkeley, Andreas_von_...","[Pierre_Duhem, Henri_Poincaré, Ludwig_Boltzman..."
6087,Jessica_Pierce,[Jessica Pierce],"Jessica Pierce (born October 21, 1965) is an A...",,,


In [170]:
dataset.to_csv('../data/wiki_dbpedia_philosophers.csv')

In [174]:
g = nx.DiGraph()

In [185]:
dataset[['philosopher_url', 'influence_outbound']].dropna().set_index('philosopher_url').to_dict()

{'influence_outbound': {'Gillian_Rose': ['Rowan_Williams',
   'Geoffrey_Hill',
   'Paul_Gilroy',
   'John_Milbank',
   'Slavoj_Žižek'],
  'Alexander_Zinoviev': ['Jon_Elster',
   'Boris_Grushin',
   'Yuri_Levada',
   'Georgy_Shchedrovitsky'],
  'Sextus_of_Chaeronea': ['Apuleius',
   'Herodes_Atticus',
   'Marcus_Aurelius',
   'Himerius'],
  'Alexander_Potebnja': ['Lev_Vygotsky'],
  'Antonio_Labriola': ['Benedetto_Croce',
   'Georges_Sorel',
   'Amadeo_Bordiga',
   'Leon_Trotsky',
   'Antonio_Gramsci'],
  'Manuel_DeLanda': ['Levi_Bryant'],
  'René_Guénon': ['Aleksandr_Dugin',
   'Carl_Schmitt',
   'Francis_Parker_Yockey',
   'Jean_Hani',
   'Philip_Sherrard',
   'Michel_Valsan',
   'Ali_Lakhani',
   'Seyyed_Hossein_Nasr',
   'Arturo_Reghini',
   'Frithjof_Schuon',
   'Ramon_Mujica_Pinilla',
   'Harry_Oldmeadow',
   'Seraphim_Rose',
   'Wolfgang_Smith',
   'Hamza_Yusuf',
   'Gary_Snyder',
   'Bernard_Philip_Kelly',
   'Whitall_Perry',
   'Hossein_Nasr',
   'Valentin_Tomberg',
   'Jean_Bor

In [205]:
dataset.to_pickle('../data/dbpedia.pkl')

In [4]:
dataset = pd.read_pickle('../data/dbpedia.pkl')

In [7]:
wikipedia = MediaWiki()

In [8]:
wikipedia.page('Ludwig_Wittgenstein').content



In [15]:
%%time
dataset['text'] = dataset['philosopher_url'].apply(get_wiki_article)

CPU times: user 1min 40s, sys: 4.04 s, total: 1min 44s
Wall time: 2h 9min 38s


In [16]:
dataset

Unnamed: 0,philosopher_url,name,abstract,notable_ideas,influence_inbound,influence_outbound,undirected_influence,text
0,Stephen_Law,[Stephen Law],Stephen Law (born 1960) is an English philoso...,,,,,Stephen Law (born 1960) is an English philoso...
1,Henry_S._Richardson,[Henry S. Richardson],Henry S. Richardson is an American philosopher...,,[John_Rawls],,[John_Rawls],Henry S. Richardson is an American philosopher...
2,John_Amos_Comenius,"[John Amos Comenius, Johann Amos Comenius]",John Amos Comenius (Czech: Jan Amos Komenský; ...,,,,,John Amos Comenius (Czech: Jan Amos Komenský; ...
3,Javier_Gomá,[Javier Gomá Lanzón],"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ...",,,,,"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ..."
4,Oskar_Negt,,Oskar Negt (German pronunciation: [ˈneːkt]; bo...,,,,,Oskar Negt (German pronunciation: [ˈneːkt]; bo...
...,...,...,...,...,...,...,...,...
6084,Stanisław_Krajewski,,Stanisław Krajewski (born 1950) is a Polish ph...,,,,,Stanisław Krajewski (born 1950) is a Polish ph...
6085,Patrick_Stokes_(philosopher),[Patrick Stokes],Patrick Stokes (born 1978) is an Australian ph...,,,,,Patrick Stokes (born 1978) is an Australian ph...
6086,Ernst_Mach,[Ernst Mach],Ernst Waldfried Josef Wenzel Mach (; German: [...,,"[Gustav_Fechner, George_Berkeley, Andreas_von_...","[Pierre_Duhem, Henri_Poincaré, Ludwig_Boltzman...","[William_James, Friedrich_Hayek, Albert_Einste...",Ernst Waldfried Josef Wenzel Mach (; German: [...
6087,Jessica_Pierce,[Jessica Pierce],"Jessica Pierce (born October 21, 1965) is an A...",,,,,"Jessica Pierce (born October 21, 1965) is an A..."


In [17]:
dataset.to_pickle('../data/dbpedia_with_articles.pkl')

In [19]:
dataset[dataset['text'] == -1]

Unnamed: 0,philosopher_url,name,abstract,notable_ideas,influence_inbound,influence_outbound,undirected_influence,text
7,Petar_II_Petrovi%C4%87-Njego%C5%A1,,,,,,,-1
27,Samuel_Butler_(1835%E2%80%931902),,,,,,,-1
78,Jayar%C4%81%C5%9Bi_Bha%E1%B9%AD%E1%B9%ADa,,,,,,,-1
110,Ra%C3%BAl_Scalabrini_Ortiz,,,,,,,-1
121,Jakob_B%C3%B6hme,,,,,,,-1
...,...,...,...,...,...,...,...,...
5644,Alan_H._Goldman,[Alan H. Goldman],Alan Harris Goldman (born 1945) is an American...,,,,,-1
5709,%C5%9A%C4%81ntarak%E1%B9%A3ita,,,,,,,-1
5746,David_Sherry,,,,,,,-1
5789,Adolfo_S%C3%A1nchez_V%C3%A1zquez,,,,,,,-1
