In [1]:
import pandas as pd
import numpy as np
import wikipedia
import urllib as urllib
import urllib.request, urllib.parse, urllib.error
import mwparserfromhell
import pywikibot

In [2]:
# Load the tab-separated mission list and keep only its 'Vehicle' column
# (a pandas Series holding the mission names used for every lookup below).
missions = pd.read_csv('Mission_names.csv', sep='\t', encoding='utf-8')['Vehicle']

PART 1: Retrieving Infobox Data for Mission Telemetry Data

The following cell is slow-running; it uses the wikipedia library. The a.url attribute relies on the Wikipedia search function to get the exact URL text for titles which may not match exactly what the mission title dataset included (for example: "Gemini IV" becomes "Gemini_4" or "STS-51D" becomes "STS-51-D" to exactly match the Wikipedia title).

This step cleans the mission title list so that the infoboxes can be accessed later, which requires page title strings to match exactly.

In [3]:
#This cell takes some time to process (one Wikipedia lookup per mission)
mission_count = len(missions) #number of missions; kept for later counting
mission_urls = [] #one Wikipedia URL per mission, in the same order as `missions`

for mission_name in missions:

    #The wikipedia library searches for the page if the input text is not a precise match
    a = wikipedia.page(mission_name)

    #stores the URL of the page that was found
    mission_urls.append(a.url)


The content function retrieves data from the rest of a wikipedia page. 

**Not used here, but preserved for later use
                #content = a.content
                #content.splitlines()**
         
The following loop iterates through the URL strings and removes all but the mission name exactly as Wikipedia sees it. This is necessary because this process interacts with multiple Wikipedia data retrieval libraries to get the infobox data as we need it.

In [4]:
#Generates a list of mission names exactly as wikipedia recognizes them
#
#NOTE: str.strip('https://en.wikipedia.org/wiki/') must not be used here. strip()
#removes any of the given *characters* from both ends of the string, not a prefix,
#so a title such as 'Space_Shuttle_Challenger' would be cut down to
#'Space_Shuttle_Chall'. Instead, keep everything that follows the first '/wiki/'.
wikipedia_formatted_titles = [
    url.split('/wiki/', 1)[-1]  #a URL without '/wiki/' is kept unchanged
    for url in mission_urls
]

In [5]:
#Displays the first five titles to verify the names recovered by the wikipedia library.
wikipedia_formatted_titles[:5]

['Gemini_4', 'Gemini_8', 'Gemini_9A', 'Gemini_10', 'Gemini_11']

The names of each category retrieved from the infobox are placed here as text for convenience (i.e. you can copy the text here and paste other code around it elsewhere). A function to condense this process would require converting a variable name to a string, and several StackExchange answers indicated that defining Python variables through a function, instead of directly, is bad practice.

date
mission_type
mission_duration
orbits_completed
spacecraft
launch_mass
landing_mass
launch_rocket
launch_site
landing_date
landing_site
orbit_epoch
orbit_reference
orbit_regime
orbit_periapsis
orbit_apoapsis
orbit_inclination
orbit_period
apsis
crew_size
crew_members

In [6]:
#Initializes one empty list per category to be pulled from the data.
#Each list is a separate object; they are filled one entry per mission.
#A better version of this could accept the category names as strings which append columns within a DataFrame, not items in a list
date, mission_type, mission_duration, orbits_completed = [], [], [], []
spacecraft, launch_mass, landing_mass = [], [], []
launch_rocket, launch_site, landing_date, landing_site = [], [], [], []
orbit_epoch, orbit_reference, orbit_regime, orbit_periapsis = [], [], [], []
orbit_apoapsis, orbit_inclination, orbit_period, apsis = [], [], [], []
crew_size, crew_members = [], []

In [7]:
#This function populates the raw data fields.
def addCategoryData(mission_dictionary, category_string, category_data):
    """Append one category of a single mission to that category's list.

    Parameters
    ----------
    mission_dictionary : dict
        Maps category names to raw values for ONE mission.
    category_string : str
        Name of the category to look up in ``mission_dictionary``.
    category_data : list
        List that receives the value; it is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Exactly one item is appended per call: the raw value when the category
    exists, otherwise the string 'NaN'. This keeps every category list the
    same length as the number of missions processed.
    """
    # Look the value up in the dictionary that was passed in. Reading a global
    # named `mission_dict` here would ignore the argument and fail with a
    # NameError whenever no such global exists.
    if category_string in mission_dictionary:
        category_data.append(mission_dictionary[category_string])
    else:
        category_data.append('NaN')

In [8]:
#https://stackoverflow.com/questions/8088226/content-of-infobox-of-wikipedia

#Maximum number of templates (counted from the top of the page) that are scanned
#for fields. Each page has its own number of templates; this value should be large
#enough to always reach whichever template contains the Infobox. Reducing it would
#shorten the run-time, but risk missing an infobox.
templatecount = 5

#(category name, list collecting that category) pairs, in the order they are filled
infobox_columns = (
    ('mission_type', mission_type),
    ('mission_duration', mission_duration),
    ('orbits_completed', orbits_completed),
    ('spacecraft', spacecraft),
    ('launch_mass', launch_mass),
    ('landing_mass', landing_mass),
    ('launch_rocket', launch_rocket),
    ('launch_site', launch_site),
    ('landing_date', landing_date),
    ('landing_site', landing_site),
    ('orbit_epoch', orbit_epoch),
    ('orbit_reference', orbit_reference),
    ('orbit_regime', orbit_regime),
    ('orbit_periapsis', orbit_periapsis),
    ('orbit_apoapsis', orbit_apoapsis),
    ('orbit_inclination', orbit_inclination),
    ('orbit_period', orbit_period),
    ('apsis', apsis),
    ('crew_size', crew_size),
    ('crew_members', crew_members),
)

#pywikibot accesses the Wikipedia API in English.
#The site object does not depend on the page, so it is created once.
enwp = pywikibot.Site('en','wikipedia')

for page_title in wikipedia_formatted_titles:

    #Retrieves the page for the given page title
    page = pywikibot.Page(enwp, page_title)

    #Retrieves the text of the entire page
    wikitext = page.get()

    #mwparserfromhell parses the page for raw wikicode data
    wikicode = mwparserfromhell.parse(wikitext)

    #filters the raw code for ALL embedded templates including the Infobox
    templates = wikicode.filter_templates()

    #Collects this mission's categories and data from the scanned templates.
    #Slicing (instead of indexing 0..templatecount-1) avoids an IndexError on
    #pages that contain fewer than `templatecount` templates.
    #If the same field name appears more than once, the last occurrence wins.
    mission_dict = {}
    for template in templates[:templatecount]:
        for param in template.params:
            #field names carry padding spaces in the wikitext; remove them
            mission_dict[param.name.strip(' ')] = param.value

    #Appends one entry (the raw value or 'NaN') to every category list
    for category_name, category_values in infobox_columns:
        addCategoryData(mission_dict, category_name, category_values)

Append the raw data to a dataframe to clean the data from there.

In [9]:
#Assembles a DataFrame of all the uncleaned data.
#Column name -> list of raw values, in the order the columns are added.
raw_columns = {
    'wikipedia_name': wikipedia_formatted_titles,
    'wikipedia_url': mission_urls,
    'mission_type': mission_type,
    'mission_duration': mission_duration,
    'orbits_completed': orbits_completed,
    'spacecraft': spacecraft,
    'launch_mass': launch_mass,
    'landing_mass': landing_mass,
    'launch_rocket': launch_rocket,
    'launch_site': launch_site,
    'landing_date': landing_date,
    'landing_site': landing_site,
    'orbit_epoch': orbit_epoch,
    'orbit_reference': orbit_reference,
    'orbit_regime': orbit_regime,
    'orbit_periapsis': orbit_periapsis,
    'orbit_apoapsis': orbit_apoapsis,
    'orbit_inclination': orbit_inclination,
    'orbit_period': orbit_period,
    'apsis': apsis,
    'crew_size': crew_size,
    'crew_members': crew_members,
}

#Starts from the mission names, then adds each raw column next to them
df = pd.DataFrame(missions)
for column_name, column_values in raw_columns.items():
    df[column_name] = pd.Series(column_values)

     

In [10]:
#Writes the raw DataFrame to a CSV file, which will be read and cleaned in the next step
df.to_csv(path_or_buf='mission_data_raw.csv', encoding='utf-8')

Proceed to 2_data_cleaning_and_merging