Wikipedia API documentation:
https://pypi.org/project/Wikipedia-API/

In [1]:
import os

import pandas as pd
import wikipediaapi

# Getting all pages from the selected category

In [2]:
# Shared Wikipedia client (language code 'en') used by the cells below.
# NOTE(review): newer Wikipedia-API releases appear to expect a user-agent string as the
# first argument - confirm against the installed version before re-running.
wiki_wiki = wikipediaapi.Wikipedia('en')

In [3]:
# Example: list every member of the category 'Activism', one title per line.
cat = wiki_wiki.page("Category:Activism")

# Iterating the member mapping walks its keys, which are the member titles.
for member_title in cat.categorymembers.keys():
    print(member_title)

Activism
Activism at Ohio Wesleyan University
Activism industry
Activist ageing
Activist knowledge
Advocacy group
All Nations Café
Anarcho-punk
Artivism
Athlete activism in the United States
Back-story (production)
Brand activism
Buycott.com
CAVE people
The Cigarette Papers
Citizen's Charter and Grievance Redressal Bill, 2011
Civil libertarianism
Civil society campaign
Cognitive activism
Community House (Salt River, Cape Town)
Constructive Program
Consumer activism
Cordobazo
Counter-recruitment
Data activism
Designated Suppliers Program
Direct Action and Democracy Today
Disability justice
DIY ethic
Dual power
Elder rights
Fan activism
Fashion activism
Free Your Mind (MTV award)
Ghost shoes
Global citizens movement
Global Day of Action
GlobaLeaks
Google Earth Outreach
Grassroots
Human shield action to Iraq
Hyde Park, Sydney
Institute for Global Communications
Jail solidarity
Justice tourism
Law collective
Miami model
Milieudefensie
Militant
The Naked Society
Nationalist activism
Nonviol

In [4]:
# Getting all pages in the selected categories

def get_pages_from_category(category):
    """Return the titles of all members of a Wikipedia category.

    Parameters
    ----------
    category : str
        Full category title including the 'Category:' prefix,
        e.g. 'Category:Activism'.

    Returns
    -------
    list
        The member titles, in the order the API object yields them.
        Sub-categories are members too, so 'Category:...' titles can
        appear next to article titles.
    """
    # Iterating categorymembers yields one title per member.
    return list(wiki_wiki.page(category).categorymembers)


# First level: every member (pages and sub-categories) of 'Category:Activism'.
sample = get_pages_from_category('Category:Activism')
print(sample, end='\n\n')  # the extra newline gives the blank separator line

print('There are %d elements in the list' % len(sample))

['Activism', 'Activism at Ohio Wesleyan University', 'Activism industry', 'Activist ageing', 'Activist knowledge', 'Advocacy group', 'All Nations Café', 'Anarcho-punk', 'Artivism', 'Athlete activism in the United States', 'Back-story (production)', 'Brand activism', 'Buycott.com', 'CAVE people', 'The Cigarette Papers', "Citizen's Charter and Grievance Redressal Bill, 2011", 'Civil libertarianism', 'Civil society campaign', 'Cognitive activism', 'Community House (Salt River, Cape Town)', 'Constructive Program', 'Consumer activism', 'Cordobazo', 'Counter-recruitment', 'Data activism', 'Designated Suppliers Program', 'Direct Action and Democracy Today', 'Disability justice', 'DIY ethic', 'Dual power', 'Elder rights', 'Fan activism', 'Fashion activism', 'Free Your Mind (MTV award)', 'Ghost shoes', 'Global citizens movement', 'Global Day of Action', 'GlobaLeaks', 'Google Earth Outreach', 'Grassroots', 'Human shield action to Iraq', 'Hyde Park, Sydney', 'Institute for Global Communications',

Some of the results are sub-categories rather than pages. We can reuse the function above and apply it to every sub-category in a list to collect its members.

In [5]:
def get_pages_from_list(list_of_elements):
    """Expand each category title in a list into the titles of its members.

    Parameters
    ----------
    list_of_elements : list of str
        Titles as returned by get_pages_from_category.

    Returns
    -------
    list
        The concatenated member titles of every element that starts with
        'Category:'. Elements without that prefix are skipped, so plain
        page titles from the input do not appear in the result.
    """
    expanded = []
    for title in list_of_elements:
        if not title.startswith('Category:'):
            continue  # only categories have members to fetch
        expanded.extend(get_pages_from_category(title))
    return expanded

In [6]:
# Second level: expand every sub-category found in `sample` into its members.
sample2 = get_pages_from_list(sample)

print()  # blank separator line
print('There are %d elements in the list' % len(sample2))


There are 904 elements in the list


In [7]:
# Third level: expand the sub-categories that are still present in `sample2`.
sample3 = get_pages_from_list(sample2)

print()  # blank separator line
print('There are %d elements in the list' % len(sample3))


There are 7298 elements in the list


# Creating the dataset

For each page we want to store the content, the category and all the pages it is linked to.

In [8]:
def create_category(category):
    """Build the [page title, category] rows for one top-level category.

    Parameters
    ----------
    category : str
        Category name without the 'Category:' prefix, e.g. 'Education'.

    Returns
    -------
    list of list
        One ``[title, category]`` pair per title found in the
        sub-categories of the given category, excluding titles that start
        with 'Category:' or 'Template:'.
    """
    top_level = get_pages_from_category('Category:' + category)
    # NOTE(review): only members of the sub-categories are kept; titles that
    # are direct members of the top-level category are not carried over
    # unless a sub-category also lists them - confirm this is intended.
    second_level = get_pages_from_list(top_level)
    # A third level could be gathered with get_pages_from_list(second_level).

    # Drop the categories and templates that remain after the expansion and
    # tag every surviving title with the requested category.
    excluded_prefixes = ('Category:', 'Template:')
    return [
        [title, category]
        for title in second_level
        if not title.startswith(excluded_prefixes)
    ]

+ Society & Social Sciences
+ Society
+ Activism, Business, Communication, Crime, Education, Ethnic groups, Family, Finance, Globalization, Government, Health, Home, Industries, Infrastructure, Law, Mass media, Military, Money, Organizations, Peace, Politics, Real estate, Rights, War

In [9]:
# Gather the [page, category] rows for 'Education' and report how many there are.
education = create_category('Education')
education_count = len(education)

print('There are %d elements in the list' % education_count)

There are 2609 elements in the list


In [10]:
# Gather the rows for the remaining categories. The names are fetched in the
# listed order and unpacked into one variable per category.
(ethnic, globa, gov, politics, war, peace, mil,
 act, rights, fin, mm, crime, fam) = [
    create_category(name)
    for name in (
        'Ethnic groups',
        'Globalization',
        'Government',
        'Politics',
        'War',
        'Peace',
        'Military',
        'Activism',
        'Rights',
        'Finance',
        'Mass media',
        'Crime',
        'Family',
    )
]

In [11]:
# Merge the per-category row lists into a single list, keeping their order.
category_batches = (education, ethnic, globa, gov, politics, war, peace,
                    mil, act, rights, fin, mm, crime, fam)
data = [row for batch in category_batches for row in batch]

In [12]:
# One row per [page, category] pair collected in `data`.
df = pd.DataFrame(data=data, columns=['Page', 'Category'])

df

Unnamed: 0,Page,Category
0,Field trip,Education
1,Education in emergencies and conflict areas,Education
2,Ability grouping,Education
3,Classwide Peer Tutoring,Education
4,Field trip,Education
...,...,...
16628,The Unprocessed Child,Family
16629,Virtual twin,Family
16630,Virtual visitation,Family
16631,"Where Are We Going, Dad? (film)",Family


## Adding the column 'Links' 

In [13]:
# For every page title, collect the titles of the pages it links to.
# The result has one list per row of df, in row order.
linked_pages = [list(wiki_wiki.page(title).links) for title in df['Page']]

In [14]:
# Attach the collected link titles as a new column. linked_pages holds one
# entry per row because it was built by iterating df['Page'].
df['Links'] = linked_pages

df

Unnamed: 0,Page,Category,Links
0,Field trip,Education,[American Association of School Administrators...
1,Education in emergencies and conflict areas,Education,"[Adult, Armed conflict, Asylum seeker, Banglad..."
2,Ability grouping,Education,"[Academic achievement, Classroom, Cluster grou..."
3,Classwide Peer Tutoring,Education,"[ADHD, Active learning, Cardio-pulmonary resus..."
4,Field trip,Education,[American Association of School Administrators...
...,...,...,...
16628,The Unprocessed Child,Family,"[A.S. Neill, Attachment parenting, ISBN (ident..."
16629,Virtual twin,Family,"[California State University, Fullerton, Digit..."
16630,Virtual visitation,Family,"[Chicago Tribune, Child custody, Divorces, E-m..."
16631,"Where Are We Going, Dad? (film)",Family,"[BBC, Beijing Enlight Pictures, Film, Film Bus..."


## Adding the column 'Text'

In [16]:
# Fetch the full text of every page. Each text is wrapped in a
# single-element list before being stored.
# NOTE(review): confirm that consumers of the dataset expect a list here
# rather than a plain string.
texts = [[wiki_wiki.page(title).text] for title in df['Page']]

df['Text'] = texts

In [17]:
# Show the frame now that the 'Text' column has been added.
df

Unnamed: 0,Page,Category,Links,Text
0,Field trip,Education,[American Association of School Administrators...,[A field trip or excursion is a journey by a g...
1,Education in emergencies and conflict areas,Education,"[Adult, Armed conflict, Asylum seeker, Banglad...",[Education in emergencies and conflict areas i...
2,Ability grouping,Education,"[Academic achievement, Classroom, Cluster grou...",[Ability grouping is the educational practice ...
3,Classwide Peer Tutoring,Education,"[ADHD, Active learning, Cardio-pulmonary resus...",[Classwide Peer Tutoring (CWPT) is a variation...
4,Field trip,Education,[American Association of School Administrators...,[A field trip or excursion is a journey by a g...
...,...,...,...,...
16628,The Unprocessed Child,Family,"[A.S. Neill, Attachment parenting, ISBN (ident...",[The Unprocessed Child: Living Without School ...
16629,Virtual twin,Family,"[California State University, Fullerton, Digit...",[Virtual twins are two children who are genera...
16630,Virtual visitation,Family,"[Chicago Tribune, Child custody, Divorces, E-m...",[Virtual visitation is the use of electronic c...
16631,"Where Are We Going, Dad? (film)",Family,"[BBC, Beijing Enlight Pictures, Film, Film Bus...","[Where Are We Going, Dad? (Chinese: 爸爸去哪儿) is ..."


# Checking for duplicates

In [18]:
# Rows whose 'Page' title already appeared in an earlier row.
df.loc[df.duplicated(subset=['Page'])]

Unnamed: 0,Page,Category,Links,Text
4,Field trip,Education,[American Association of School Administrators...,[A field trip or excursion is a journey by a g...
161,Peer learning,Education,"[Action learning, Andragogy, Autodidactism, Bi...",[One of the most visible approaches to peer le...
224,Classwide Peer Tutoring,Education,"[ADHD, Active learning, Cardio-pulmonary resus...",[Classwide Peer Tutoring (CWPT) is a variation...
231,The First-Year Experience Program,Education,"[Acadia National Park, Appreciative advising, ...",[The First-Year Experience (FYE) (also known a...
233,Gamification of learning,Education,"[Affordance, Amy Jo Kim, Avatar (computing), B...",[The gamification of learning is an educationa...
...,...,...,...,...
16621,Split custody,Family,"[Adoption, Adultery, After-school activity, Al...",[Split custody refers to a child custody arran...
16623,Stepfather,Family,"[Adoption, Affinity (law), Agape, Ahnentafel, ...",[A stepfather or stepdad is a non-biological m...
16624,Stepsibling,Family,"[Adoption, Affinity (law), Agape, Ahnentafel, ...",[Step-siblings are children born of two differ...
16625,Strict father model,Family,"[Adoption, After-school activity, Alan E. Kazd...",[The strict father model of parenting is one w...


In [22]:
# Keep only the first row for each page title; a page gathered under several
# categories therefore keeps the category of its first occurrence.
df = df.drop_duplicates(subset=['Page'], keep='first')

In [23]:
# Show the frame after the duplicate pages have been dropped.
df

Unnamed: 0,Page,Category,Links,Text
0,Field trip,Education,[American Association of School Administrators...,[A field trip or excursion is a journey by a g...
1,Education in emergencies and conflict areas,Education,"[Adult, Armed conflict, Asylum seeker, Banglad...",[Education in emergencies and conflict areas i...
2,Ability grouping,Education,"[Academic achievement, Classroom, Cluster grou...",[Ability grouping is the educational practice ...
3,Classwide Peer Tutoring,Education,"[ADHD, Active learning, Cardio-pulmonary resus...",[Classwide Peer Tutoring (CWPT) is a variation...
5,Flexible learning,Education,"[Australia, Blended learning, Distance educati...",[Flexible learning is a principle of practice ...
...,...,...,...,...
16628,The Unprocessed Child,Family,"[A.S. Neill, Attachment parenting, ISBN (ident...",[The Unprocessed Child: Living Without School ...
16629,Virtual twin,Family,"[California State University, Fullerton, Digit...",[Virtual twins are two children who are genera...
16630,Virtual visitation,Family,"[Chicago Tribune, Child custody, Divorces, E-m...",[Virtual visitation is the use of electronic c...
16631,"Where Are We Going, Dad? (film)",Family,"[BBC, Beijing Enlight Pictures, Film, Film Bus...","[Where Are We Going, Dad? (Chinese: 爸爸去哪儿) is ..."


In [24]:
# Save the dataset to disk. DataFrame.to_csv does not create missing parent
# directories, so make sure ./data exists before writing.
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/dataset.csv')