In [35]:
import wikipedia
import re
import pandas as pd
import time

First we get CSV files of Wikipedia page titles from 5 different categories using the <a href = "https://petscan.wmflabs.org">PetScan</a> tool.

The 5 categories used are:
    -  Politics
    -  Sports
    -  History
    -  Culture
    -  Computer science

In [8]:
# Load the per-category page listings, one CSV file per category.
politics, sports, history, culture, comp_science = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Politics.csv",
        "Sports.csv",
        "History.csv",
        "Culture.csv",
        "Computer_science.csv",
    )
)

In [10]:
# Preview the first rows of the politics table to see which columns were loaded.
politics.head()

Unnamed: 0,number,title,pageid,namespace,length,touched
0,1,Antisemitism,1078,,211152,20180324171338
1,2,Crony_capitalism,5249,,38003,20180305205053
2,3,Classical_liberalism,6677,,51315,20180321212302
3,4,Citizenship,6784,,57974,20180323010834
4,5,Corporatocracy,6997,,29355,20180311201059


## Removing extra columns

In [11]:
# Columns from the export that are not used in the rest of the notebook.
EXTRA_COLUMNS = ['number', 'namespace', 'touched']

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
politics = politics.drop(columns=EXTRA_COLUMNS, errors='ignore')
sports = sports.drop(columns=EXTRA_COLUMNS, errors='ignore')
history = history.drop(columns=EXTRA_COLUMNS, errors='ignore')
culture = culture.drop(columns=EXTRA_COLUMNS, errors='ignore')
comp_science = comp_science.drop(columns=EXTRA_COLUMNS, errors='ignore')

In [12]:
# Preview the politics table again to check which columns remain.
politics.head()

Unnamed: 0,title,pageid,length
0,Antisemitism,1078,211152
1,Crony_capitalism,5249,38003
2,Classical_liberalism,6677,51315
3,Citizenship,6784,57974
4,Corporatocracy,6997,29355


## Reduce each category to its N longest docs (N = 50 here)

In [91]:
N = 50


def _longest_pages(frame, n):
    """Return the `n` rows of `frame` with the largest 'length', re-indexed from 0."""
    by_length = frame.sort_values('length', ascending=False)
    return by_length.head(n).reset_index(drop=True)


politics = _longest_pages(politics, N)
sports = _longest_pages(sports, N)
history = _longest_pages(history, N)
culture = _longest_pages(culture, N)
comp_science = _longest_pages(comp_science, N)

In [92]:
# Preview the first rows of the computer-science table.
comp_science.head()

Unnamed: 0,title,pageid,length
0,Artificial intelligence,1164,231620
1,Comparison of programming languages (string fu...,3681422,109570
2,Geographic information system,12398,77692
3,Computational creativity,16300571,61153
4,Computational phylogenetics,3986130,58742


In [93]:
# Check the (rows, columns) shape of the computer-science table.
comp_science.shape

(50, 3)

In [94]:
def correct_title(category):
    """Replace underscores with spaces in the 'title' column of `category`.

    The frame is modified in place; nothing is returned.

    Values that are not strings (for example a missing title read as NaN)
    are left untouched instead of raising a TypeError.
    """
    def _spaced(title):
        # A literal replacement is enough here; no regular expression needed.
        return title.replace('_', ' ') if isinstance(title, str) else title

    category['title'] = category['title'].map(_spaced)

In [95]:
# Normalise the titles of every category table in turn.
for category_frame in (politics, sports, history, culture, comp_science):
    correct_title(category_frame)

## Adding the content column

In [96]:
def add_contents(category, batch_size=10, pause_seconds=20):
    """Fetch the Wikipedia text of every page in `category` into a 'content' column.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    category : DataFrame with a 'title' column and, optionally, a 'pageid' column.
    batch_size : number of pages to fetch between pauses (default 10).
        A falsy value disables pausing.
    pause_seconds : length of each pause in seconds (default 20).

    Pages are looked up by 'pageid' when that column is present and the value
    is not missing, because an id identifies the page unambiguously. A title
    lookup goes through the library's auto-suggest and can resolve to a
    different page or fail on titles with unusual characters. Rows without a
    usable pageid fall back to the title lookup.

    Any exception raised by `wikipedia.page` propagates to the caller.
    """
    titles = list(category['title'])
    if 'pageid' in category.columns:
        page_ids = list(category['pageid'])
    else:
        page_ids = [None] * len(titles)

    total = len(titles)
    content = []
    for position, (title, page_id) in enumerate(zip(titles, page_ids), start=1):
        if page_id is not None and pd.notna(page_id):
            p = wikipedia.page(pageid=int(page_id))
        else:
            p = wikipedia.page(title)
        content.append(p.content)
        # Pause after each full batch, but not after the very last page:
        # there is nothing left to fetch, so waiting would only waste time.
        if batch_size and position % batch_size == 0 and position < total:
            time.sleep(pause_seconds)
    category['content'] = content

In [110]:
# Fetch article text for every category, not only computer science: all five
# tables are written to CSV afterwards, so each one needs its 'content' column.
for category_frame in (politics, sports, history, culture, comp_science):
    add_contents(category_frame)

In [112]:
# Write each sampled category table to its own CSV file.
for csv_name, sample_frame in (
    ("50_sample_politics.csv", politics),
    ("50_sample_sports.csv", sports),
    ("50_sample_history.csv", history),
    ("50_sample_culture.csv", culture),
    ("50_sample_comp_science.csv", comp_science),
):
    sample_frame.to_csv(csv_name)

In [114]:
# Show only the first rows; the 'content' column holds full article text, so
# displaying the whole table is unnecessarily heavy.
politics.head(10)

Unnamed: 0,title,pageid,length,content
0,Newspaper endorsements in the United States pr...,51202493,523778,Various notable newspapers made endorsements o...
1,Timeline of 1960s counterculture,45684607,421412,The following is a chronological capsule histo...
2,Antisemitism,1078,211152,Antisemitism (also spelled anti-Semitism or an...
3,Libertarianism,3225498,172434,"Libertarianism (from Latin: libertas, meaning ..."
4,"List of scandals with \-gate\"" suffix""",243541,163074,This is a list of scandals or controversies wh...
5,History of terrorism,6999780,154272,The history of terrorism is a history of well-...
6,Arguments for and against drug prohibition,1181942,150880,"Arguments about the prohibition of drugs, and ..."
7,List of political catchphrases,7488151,135800,The following is a list of political catchphra...
8,Neo-Nazism,54361,133006,Neo-Nazism consists of post-World War II milit...
9,Criticism of Confucius Institutes,32362385,127492,"The Confucius Institute (CI) program, which be..."


In [76]:
# Fetch the page explicitly so this cell does not depend on a leftover `p`
# variable from an earlier interactive session. 243541 is the pageid listed
# in the politics table for the "-gate" scandals article.
p = wikipedia.page(pageid=243541)
p.url

'https://en.wikipedia.org/wiki/List_of_scandals_with_%22-gate%22_suffix'

In [66]:
# Preview the first rows of the politics table.
# NOTE(review): the recorded output for this cell still shows underscored
# titles and no 'content' column, and its execution count is lower than the
# cells above. It looks like a leftover from an earlier run; confirm and
# re-run or remove.
politics.head()

Unnamed: 0,title,pageid,length
0,Newspaper_endorsements_in_the_United_States_pr...,51202493,523778
1,Timeline_of_1960s_counterculture,45684607,421412
2,Antisemitism,1078,211152
3,Libertarianism,3225498,172434
4,"List_of_scandals_with_\-gate\""_suffix""",243541,163074
