In [1]:
import wikipedia
import re
import pandas as pd
import time

First we get CSV files of the Wikipedia page titles from 5 different categories using the <a href = "https://petscan.wmflabs.org">PetScan API</a>.

The 5 categories used are:
    -  Politics
    -  Sports
    -  History
    -  Culture
    -  Computer science

I downloaded the table of Wikipedia articles in each of these categories from <a href = "https://petscan.wmflabs.org/">this</a> API.

In [2]:
# Load the exported article table of each category, one CSV file per category.
politics, sports, history, culture, comp_science = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Politics.csv",
        "Sports.csv",
        "History.csv",
        "Culture.csv",
        "Computer_science.csv",
    )
)

In [3]:
# Preview the first rows of the politics table exactly as loaded from the CSV.
politics.head()

Unnamed: 0,number,title,pageid,namespace,length,touched
0,1,Antisemitism,1078,,211152,20180324171338
1,2,Crony_capitalism,5249,,38003,20180305205053
2,3,Classical_liberalism,6677,,51315,20180321212302
3,4,Citizenship,6784,,57974,20180323010834
4,5,Corporatocracy,6997,,29355,20180311201059


## Removing extra columns

In [4]:
# Export columns that are removed here; title, pageid and length are kept.
extra_columns = ['number', 'namespace', 'touched']

# Rebind to a new frame instead of dropping in place, and ignore columns that
# are already gone, so re-running this cell does not raise a KeyError.
politics = politics.drop(columns=extra_columns, errors='ignore')
sports = sports.drop(columns=extra_columns, errors='ignore')
history = history.drop(columns=extra_columns, errors='ignore')
culture = culture.drop(columns=extra_columns, errors='ignore')
comp_science = comp_science.drop(columns=extra_columns, errors='ignore')

In [5]:
# Check which columns remain in the politics table after the drop.
politics.head()

Unnamed: 0,title,pageid,length
0,Antisemitism,1078,211152
1,Crony_capitalism,5249,38003
2,Classical_liberalism,6677,51315
3,Citizenship,6784,57974
4,Corporatocracy,6997,29355


In [6]:
# Order every table from the longest article to the shortest and renumber the
# rows 0..n-1. add_contents walks each table from the top, so this order
# decides which pages are fetched first.
# reset_index(drop=True) discards the old index directly instead of turning it
# into an 'index' column and dropping that column again.
politics = politics.sort_values('length', ascending=False).reset_index(drop=True)
sports = sports.sort_values('length', ascending=False).reset_index(drop=True)
history = history.sort_values('length', ascending=False).reset_index(drop=True)
culture = culture.sort_values('length', ascending=False).reset_index(drop=True)
comp_science = comp_science.sort_values('length', ascending=False).reset_index(drop=True)

In [7]:
# Check the sort: the rows should now be in descending order of length.
comp_science.head()

Unnamed: 0,title,pageid,length
0,Artificial_intelligence,1164,231620
1,Comparison_of_programming_languages_(string_fu...,3681422,109570
2,Geographic_information_system,12398,77692
3,Computational_creativity,16300571,61153
4,Computational_phylogenetics,3986130,58742


In [8]:
# (rows, columns) of the computer-science table, i.e. how many candidate pages it lists.
comp_science.shape

(1171, 3)

In [9]:
# Titles come with underscores between words; swap them for spaces so they
# match the title format used when querying the API.
def correct_title(category):
    """Rewrite the ``title`` column of ``category`` with underscores replaced by spaces.

    The frame is modified in place and nothing is returned.
    """
    def _with_spaces(raw_title):
        return re.sub('_', ' ', raw_title)

    category['title'] = category['title'].map(_with_spaces)

In [10]:
# Normalise the titles of every category table in place.
for category_frame in (politics, sports, history, culture, comp_science):
    correct_title(category_frame)

## Adding the content column
To add the raw content of each page I will use the <a href = "https://pypi.python.org/pypi/wikipedia">wikipedia</a> library. This library calls the <a href = "https://www.mediawiki.org/wiki/API:Main_page">MediaWiki</a> API directly: given a page title, it returns the raw content of that page.

In [19]:
def add_contents(category, n_pages=1000, pause=10, max_retries=5):
    """Fetch page contents for the titles in ``category``.

    Titles are tried in row order until ``n_pages`` pages have been fetched
    or the table is exhausted, whichever comes first.

    Parameters
    ----------
    category : pandas.DataFrame
        Table with a ``title`` column holding the page titles to look up.
    n_pages : int, default 1000
        Maximum number of pages to collect.
    pause : float, default 10
        Seconds to sleep after every 10 requests and before each retry.
    max_retries : int, default 5
        How many times a title is retried after an unexpected error
        (anything other than the page/disambiguation/redirect errors)
        before it is skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose page was fetched, with an added ``content``
        column. ``category`` itself is not modified.
    """
    # Build the title list once instead of on every loop iteration.
    titles = list(category['title'])
    contents = []
    positions = []
    requests_since_pause = 0
    retries = 0
    pos = 0
    # Stop when enough pages were collected OR the titles ran out; the
    # previous "or ind >= len(...)" condition walked past the end of the list
    # and raised IndexError when fewer than 1000 pages were retrievable.
    while len(contents) < n_pages and pos < len(titles):
        requests_since_pause += 1
        title = titles[pos]
        try:
            # NOTE(review): wikipedia.page() may apply its own title
            # suggestion/redirect handling; confirm the returned page matches
            # the requested title if exact matches are required.
            page = wikipedia.page(title)
            contents.append(page.content)
            positions.append(pos)
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.RedirectError):
            # No usable article behind this title: move on to the next one.
            pass
        except Exception:
            # Possibly transient (e.g. lost connection): wait and retry the
            # same title, but only a bounded number of times so a persistent
            # error cannot loop forever.
            retries += 1
            if retries <= max_retries:
                time.sleep(pause)
                continue
            print('skipping %r after %d failed attempts' % (title, retries))
        pos += 1
        retries = 0
        # ">=" rather than "==": the counter also advances on failed
        # requests, so it can pass 10 without ever being equal to it here.
        if requests_since_pause >= 10:
            time.sleep(pause)
            requests_since_pause = 0
            print(pos)
    # ``positions`` are positional offsets into ``titles``, hence iloc; copy so
    # adding the column does not write into a slice of ``category``.
    result = category.iloc[positions].copy()
    result['content'] = contents
    return result

Fetching 1000 pages for each category takes some time (because of the exceptions that the API throws, for instance when the connection is lost), so this part was done in a separate script (get_wiki_content.py).