In [1]:
import pandas as pd
import os
import requests
import mwparserfromhell

In [2]:
### Main I/O control variables
# "in_needed_pages": the csv containing the links to the pages or the page names
in_needed_pages = 'demo_pages_in_wars_1945-1989.csv'
# "link" set True if "in_needed_pages" contains links
link = True
# The csv to write the analysis result into
out_analysis = 'demo_analysis_result.csv'
# The csv for the results
out_results = 'demo_templates_result_wars_1945-1989.csv'

### Template control variables
# "ll_templates" and "ll_variables" need to be the same length
# "ll_templates" is a list of lists, the list of templates is useful if you expect more templates having the same set of variables
# When a template is found with at least one variable that is needed, the search stops (i.e. no further templates are checked in the list)
# All the elements of the list (of the lists) are looped through, this way you can get multiple types of info from each page
ll_templates = [['Age in years, months, weeks and days', 'Age in months, weeks and days', 'Start date and age'],
                ['Infobox military conflict', 'Infobox civil conflict']]
# "ll_variables" is the corresponding list of lists containing the variables expected within each of the templates
# It needs to be in the same order as "ll_templates"
ll_variables = [['month1', 'day1', 'year1', 'month2', 'day2', 'year2'],
                ['date', 'dates', 'conflict', 'place', 'result', 'combatant1', 'combatant2', 'strength1', 'strength2', 'casualties1', 'casualties2']]

### Additional control variables
# Input directory, e.g. './Input'
input_dir = './Input'
# Output directory, e.g. './Output'
output_dir = './Output'
# The directory for the downloaded wikitexts, e.g. './Wikitexts'
wikitexts_dir = './Wikitexts'
# "pages_limit" allows for limiting the pages used for analysis or data extraction, useful for testing settings.
# If not needed, select a necessarily high number
pages_limit = 999999
# These are the characters in the link removed for the filename containing the wikitexts
replace_chars = ':/.=&?%'

In [3]:
# FUNCTION DEFINITION: Downloads missing pages

def download_missing_pages(df_needed, list_have, links=True):
    if links:
        for index, row in df_needed.iterrows():
            filename = row[0]
            for char in replace_chars:
                filename = filename.replace(char, '')
            if filename + '.wiki' in list_have:
                print('Found: ' + filename, end=' - ')
            else:
                try:
                    page = requests.get(row[0] + '?action=raw')
                    wikitext = page.text
                    file = open(wikitexts_dir + '/' + filename + '.wiki', 'w')
                    file.write(wikitext)
                    file.close()
                    print('Downloaded: ' + filename, end=' - ')
                except:
                    print('\n*** Failed to download: ' + filename, end=' ***\n')
    else:
        for index, row in df_needed.iterrows():
            if row[0].replace(' ', '_') + '.wiki' in list_have:
                print('Found: ' + row[0], end=' - ')
            else:
                try:
                    page = requests.get('https://en.wikipedia.org/wiki/' + row[0].replace(' ', '_') + '?action=raw')
                    wikitext = page.text
                    file = open(wikitexts_dir + '/' + row[0].replace(' ', '_') + '.wiki', 'w')
                    file.write(wikitext)
                    file.close()
                    print('Downloaded: ' + row[0], end=' - ')
                except:
                    print('\n*** Failed to download: ' + row[0], end=' ***\n')

In [4]:
# FUNCTION DEFINITION: Gets all templates found in a Wikitext and returns them in a dictionary

def get_all_templates(wikitext):
    d_templates_found = dict()
    templates = mwparserfromhell.parse(wikitext).filter_templates()
    
    for template in templates:
        if str(template.name) in d_templates_found:
            i = 1
            while str(template.name) + '_' + str(i) in d_templates_found:
                i += 1
            d_templates_found[str(template.name) + '_' + str(i)] = template.params
        else:
            d_templates_found[str(template.name)] = template.params
    
    return d_templates_found

In [5]:
# FUNCTION DEFINITION: Returns the parameters for the first occurence of the requested template in the Wikitext

def dig_template(wikitext, template_name):
    templates = mwparserfromhell.parse(wikitext).filter_templates()
    for template in templates:
        if template.name.matches(template_name):
            return template.params

In [6]:
# FUNCTION DEFINITION: Gets all variables from wikitext using a list of templates (useful if you expect more templates having the same set of variables)
# When a template is found with at least one variable that is needed, the search stops (i.e. no further templates are checked in the list)

def variables_from_templates(wikitext, l_templates, l_variables):
    d_variables_found = dict()
    found = False

    for template in l_templates:
        if found:
            break
        try:
            for param in dig_template(wikitext, template):
                for var in l_variables:
                    if param.name == var:
                        d_variables_found[var] = param.value.strip_code()
                        found = True
        except:
            continue

    return d_variables_found

In [7]:
### PART 1: DOWNLOAD NEEDED PAGES ###
### Run this part only if new pages were added to the list and they need to be downloaded ###
# Reads in the list of pages needed, downloads the ones not present in the directory selected for the Wikitexts
# TODO: choice of page name and page link (diff var name in input csv)

df_needed = pd.read_csv(input_dir + '/' + in_needed_pages, header=None)
list_have = os.listdir(wikitexts_dir)

download_missing_pages(df_needed, list_have, link)

Found: httpsenwikipediaorgwikiCrusaders_(guerrilla) - Found: httpsenwikipediaorgwindexphptitle1945_Khuzestan_revoltactioneditredlink1 - Found: httpsenwikipediaorgwindexphptitleSafi_Rebellionactioneditredlink1 - Found: httpsenwikipediaorgwiki1945_Hazara_Rebellion - Found: httpsenwikipediaorgwikiKorean_conflict - Found: httpsenwikipediaorgwikiIndonesian_National_Revolution - Found: httpsenwikipediaorgwikiIran_crisis_of_1946 - Found: httpsenwikipediaorgwikiHukbalahap_rebellion - Found: httpsenwikipediaorgwikiAutumn_Uprising_of_1946 - Found: httpsenwikipediaorgwikiPunnapra-Vayalar_uprising - Found: httpsenwikipediaorgwikiFirst_Indochina_War - Found: httpsenwikipediaorgwikiGreek_Civil_War - Found: httpsenwikipediaorgwikiParaguayan_Civil_War_(1947) - Found: httpsenwikipediaorgwiki1947_Poonch_Rebellion - Found: httpsenwikipediaorgwikiIntegration_of_Junagadh - Found: httpsenwikipediaorgwikiRomanian_anti-communist_resistance_movement - Found: httpsenwikipediaorgwikiIndo-Pakistani_War_of_1947 - 

In [42]:
### PART 2: GET TEMPLATES FROM PAGES ###
### This part does the actual template extraction from the pages. ###
# Gets the templates from the pages based on the template and variable settings
# The results can be written out into a csv file

df_needed = pd.read_csv(input_dir + '/' + in_needed_pages, header=None)
df_results = pd.DataFrame()

for index, row in df_needed[:pages_limit].iterrows():    
    try:
        if link:
            filename = row[0]
            for char in replace_chars:
                filename = filename.replace(char, '')
            file = open(wikitexts_dir + '/' + filename +  '.wiki', 'r')
        else:
            file = open(wikitexts_dir + '/' + row[0].replace(' ', '_') +  '.wiki', 'r')
        wikitext = file.read()
    except:
        print("File open error: " + row[0])
        continue
    d_vars_f_temps = {'Source_Name': row[0]}
    for l_templates, l_variables in zip(ll_templates, ll_variables):
        d_vars_f_temps.update(variables_from_templates(wikitext, l_templates, l_variables))
    df_results = df_results.append(d_vars_f_temps, ignore_index=True)

In [45]:
columns_for_clearing_nas = list(df_results.columns)
columns_for_clearing_nas.remove('Source_Name')
df_results.dropna(how='all', axis='index', subset=columns_for_clearing).head()

Unnamed: 0,Source_Name,day1,day2,month1,month2,year1,year2,casualties1,casualties2,combatant1,combatant2,conflict,date,place,result,strength1,strength2
10,https://en.wikipedia.org/wiki/First_Indochina_War,19.0,1.0,12.0,8.0,1946.0,1954.0,,,,,,,,,,
11,https://en.wikipedia.org/wiki/Greek_Civil_War,,,,,1943.0,1944.0,,,,,,,,,,
17,https://en.wikipedia.org/wiki/Malagasy_Uprising,,,,,,,590 French soldiers killed,Unknown,French colonial empire,Malagasy secret societies\nVy Vato Sakelika\nJ...,Malagasy Uprising,29 March 1947 – February 1949,Madagascar,"Uprising crushed by French forces, various Ma...","18,000 30,000","initially 2,000, later hundreds of thousands"
20,https://en.wikipedia.org/wiki/Costa_Rican_Civi...,12.0,24.0,3.0,4.0,1948.0,1948.0,,,,,,,,,,
24,https://en.wikipedia.org/wiki/1948_Arab%E2%80%...,15.0,10.0,5.0,3.0,1948.0,1949.0,,,,,,,,,,


In [44]:
# Write out the results to a csv

df_results.to_csv(output_dir + '/' + out_results, index=False)

In [8]:
### PART 3: ANALYSIS OF ALL TEMPLATES IN THE PAGES ###
### Run this part only if you need a complete analysis of the templates in the pages. It can take a long time to run! ###
# Gets all templates found in the pages needed, prints out a report about the number of occurences of all templates sorted
# Possibility to write out a csv of the contents of all templates for all pages for further analysis

df_needed = pd.read_csv(input_dir + '/' + in_needed_pages, header=None)
df_templates_raw = pd.DataFrame()

for index, row in df_needed[:pages_limit].iterrows():
    try:
        if link:
            filename = row[0]
            for char in replace_chars:
                filename = filename.replace(char, '')
            file = open(wikitexts_dir + '/' + filename +  '.wiki', 'r')
        else:
            file = open(wikitexts_dir + '/' + row[0].replace(' ', '_') +  '.wiki', 'r')
        wikitext = file.read()
    except:
        print("File open error: " + row[0])
        continue
    d_all_temps = get_all_templates(wikitext)
    d_all_temps.update({'Source_Name': row[0]})
    df_templates_raw = df_templates_raw.append(d_all_temps, ignore_index=True)


d_templates_analysis = dict()

for column in df_templates_raw.columns:
    d_templates_analysis[column] = df_templates_raw[column].count()

ser_templates_analysis = pd.Series(d_templates_analysis).sort_values(ascending=False)

print('ANALYSIS RESULTS\nTEMPLATE: NO. OF OCCURENCES')
for index, value in ser_templates_analysis.iteritems():
    if index != 'Source_Name':
        print(f'{index}: {value}; ', end='')

ANALYSIS RESULTS
TEMPLATE: NO. OF OCCURENCES
Infobox military conflict
: 161; cite web: 141; cite book: 122; cite web_1: 121; Reflist: 121; flag: 116; cite book : 110; flagicon: 106; cite web : 103; cite web_2: 103; flagicon image: 100; cite book_1: 100; Use dmy dates: 96; flag_1: 94; cite web_3: 92; cite news: 87; flagicon_1: 86; ISBN: 84; cite book _1: 83; cite book_2: 83; cite news : 82; cite web_4: 80; flagicon image_1: 78; flagicon_2: 76; cite web_5: 74; reflist: 74; Age in years, months, weeks and days: 73; cite news_1: 72; cite web _1: 70; flag_2: 70; cite book _2: 69; flagicon_3: 68; cite web_6: 68; Cite book: 68; cite book_3: 67; cite news _1: 66; flagicon image_2: 65; main: 64; citation needed: 63; cite journal : 63; cite web_7: 63; ISBN_1: 61; Cold War: 61; cite book_4: 61; flagicon_4: 61; Main: 61; cite book _3: 60; webarchive: 59; flagicon image_3: 59; cite web_8: 59; cite news_2: 58; cite web _2: 57; cite web_9: 56; Citation needed: 55; flag_3: 55; flagicon image_4: 54; K

In [10]:
df_templates_raw.head()

Unnamed: 0,Clear,Infobox militant organization\n,Reflist,Source_Name,citation needed,citation needed_1,citation needed_2,cite book,cite book.1,cite book _1,...,full citation needed_6,full citation needed_7,full citation needed_8,full citation needed_9,page needed_5,page needed_6,page needed_7,page needed_8,Liberia topics,Years in Liberia
0,[],"[(n, a, m, e, , , , , , =, , C, r, u, s,...",[],https://en.wikipedia.org/wiki/Crusaders_(guerr...,"[(d, a, t, e, =, J, u, n, e, , 2, 0, 1, 4)]","[(d, a, t, e, =, J, u, n, e, , 2, 0, 1, 4)]","[(d, a, t, e, =, J, u, n, e, , 2, 0, 1, 4)]","[(l, a, s, t, =, S, e, l, l, s), (f, i, r, s, ...","[( , l, a, s, t, =, F, i, s, c, h, e, r, ), (...","[( , l, a, s, t, , =, , L, i, č, i, n, a, )...",...,,,,,,,,,,
1,,,,https://en.wikipedia.org/w/index.php?title=194...,,,,,,,...,,,,,,,,,,
2,,,,https://en.wikipedia.org/w/index.php?title=Saf...,,,,,,,...,,,,,,,,,,
3,,,,https://en.wikipedia.org/wiki/1945_Hazara_Rebe...,,,,,,,...,,,,,,,,,,
4,,,[],https://en.wikipedia.org/wiki/Korean_conflict,,,,"[( , t, i, t, l, e, , =, , T, h, e, , M, a,...","[(a, u, t, h, o, r, =, , H, y, u, n, g, , G,...","[( , t, i, t, l, e, , =, , K, o, r, e, a, ',...",...,,,,,,,,,,


In [9]:
# The details of the templates in each of the pages can be written out into a csv

df_templates_raw[list(ser_templates_analysis.index)].to_csv(output_dir + '/' + out_analysis, index=False)