In [None]:
# import required libraries
import pandas as pd
import numpy as np
import requests

In [None]:
import datetime

In [None]:
spacex_url = "https://api.spacexdata.com/v4/launches/past"

In [None]:
response = requests.get(spacex_url)

In [None]:
print(response.status_code)

In [None]:
## print the raw that response.get() extracted from spacex API.
# print(response.text)
#  (or)
# print(response.json())

In [None]:
# assigning Response data to a variable which is in JSON.
data = response.json()

##### The data collected above comes from the live API and changes over time. To keep this learning project reproducible, I will instead use the static JSON snapshot provided by Coursera.

In [None]:
spacex_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json"

In [None]:
response = requests.get(spacex_url)

In [None]:
print(response.status_code)

In [None]:
data = response.json()

In [None]:
# Print the JSON data.
# print(data)

##### The JSON data has a nested structure, so we use the `json_normalize` method to flatten it into a DataFrame.

In [None]:
# Flatten the list of launch records in `data` into a tabular DataFrame.
df = pd.json_normalize(data)

In [None]:
# First 5 rows of the DataFrame.
# df.head()

In [None]:
# Columns in the DataFrame.
# df.columns

In [None]:
# Data type of each column.
# df.dtypes

In [None]:
# Shape (rows, columns) of the dataset.
# df.shape

##### Filtering the dataset by keeping the required columns for the project.

In [None]:
df = df[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

In [None]:
# Removing rows with multiple cores(rockets with extra boosters).
df = df[df['cores'].map(len) == 1]
df = df[df['payloads'].map(len) == 1]

In [None]:
# Both payloads and cores have lists as data, extracting the single value from them.
df['cores'] = df['cores'].map(lambda x : x[0])
df['payloads'] = df['payloads'].map(lambda x : x[0])

In [None]:
# Date columns contains both date and time, we need to extract date.
df['date'] = pd.to_datetime(df['date_utc']).dt.date

In [None]:
# Filtering the launches happened before 2020-11-13.
df = df[df['date'] <= datetime.date(2020,11,13)]

In [None]:
# df.tail()
df.shape

#### Several columns contain only API identifiers (IDs) rather than readable values; we need to convert them into human-readable form.

##### We create helper functions that query the API with these IDs to retrieve the corresponding information.

In [None]:
# Global accumulators: the helper functions append the extracted values to
# these lists, which later become the columns of a new DataFrame.

# Filled by getBoosterVersion.
BoosterVersion = []

# Filled by getLaunchSite.
LaunchSite = []
Longitude = []
Latitude = []

# Filled by getPayLoadData.
PayloadMass = []
Orbit = []

# Filled by getCoreData.
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []

In [None]:
# ---- Helper functions ----

# From ROCKET column we can extract booster name.
def getBoosterVersion(data):
    """Append one booster name per entry of ``data['rocket']`` to ``BoosterVersion``.

    Args:
        data: DataFrame (or mapping) whose ``'rocket'`` entry iterates over
            SpaceX API rocket IDs.

    Each distinct rocket ID is fetched once from the v4 ``rockets`` endpoint
    and the ``name`` field of the reply is recorded. A missing (falsy) ID
    appends ``None`` instead of being skipped, so ``BoosterVersion`` stays
    aligned entry-for-entry with ``data``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Many launches share the same rocket, so remember names already fetched.
    names_by_id = {}
    for rocket_id in data['rocket']:
        if not rocket_id:
            BoosterVersion.append(None)
            continue
        if rocket_id not in names_by_id:
            reply = requests.get(
                "https://api.spacexdata.com/v4/rockets/" + str(rocket_id),
                timeout=30,
            )
            reply.raise_for_status()
            names_by_id[rocket_id] = reply.json()['name']
        BoosterVersion.append(names_by_id[rocket_id])

# From LAUNCHPAD column we can extract coordinates and launchsite name.
def getLaunchSite(data):
    """Append launch-site name and coordinates for each entry of ``data['launchpad']``.

    Args:
        data: DataFrame (or mapping) whose ``'launchpad'`` entry iterates over
            SpaceX API launchpad IDs.

    Each distinct launchpad ID is fetched once from the v4 ``launchpads``
    endpoint; its ``longitude``, ``latitude`` and ``name`` fields are appended
    to ``Longitude``, ``Latitude`` and ``LaunchSite``. A missing (falsy) ID
    appends ``None`` to all three lists instead of being skipped, so they stay
    aligned entry-for-entry with ``data``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Many launches share the same pad, so remember records already fetched.
    pads_by_id = {}
    for pad_id in data['launchpad']:
        if not pad_id:
            Longitude.append(None)
            Latitude.append(None)
            LaunchSite.append(None)
            continue
        if pad_id not in pads_by_id:
            reply = requests.get(
                "https://api.spacexdata.com/v4/launchpads/" + str(pad_id),
                timeout=30,
            )
            reply.raise_for_status()
            pads_by_id[pad_id] = reply.json()
        pad = pads_by_id[pad_id]
        Longitude.append(pad['longitude'])
        Latitude.append(pad['latitude'])
        LaunchSite.append(pad['name'])

# From PAYLOAD column we extract payload mass and orbit name.
def getPayLoadData(data):
    """Append payload mass and orbit for each entry of ``data['payloads']``.

    Args:
        data: DataFrame (or mapping) whose ``'payloads'`` entry iterates over
            SpaceX API payload IDs.

    Each ID is fetched from the v4 ``payloads`` endpoint; the ``mass_kg`` and
    ``orbit`` fields of the reply are appended to ``PayloadMass`` and
    ``Orbit``. A missing (falsy) ID appends ``None`` to both lists instead of
    being skipped, so they stay aligned entry-for-entry with ``data``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    for payload_id in data['payloads']:
        if not payload_id:
            PayloadMass.append(None)
            Orbit.append(None)
            continue
        reply = requests.get(
            "https://api.spacexdata.com/v4/payloads/" + str(payload_id),
            timeout=30,
        )
        reply.raise_for_status()
        payload = reply.json()
        PayloadMass.append(payload['mass_kg'])
        Orbit.append(payload['orbit'])

# From CORES column we can extract various data.
def getCoreData(data):
    """Append core and landing details for each entry of ``data['cores']``.

    Args:
        data: DataFrame (or mapping) whose ``'cores'`` entry iterates over
            dicts with the keys ``core``, ``landing_success``,
            ``landing_type``, ``flight``, ``gridfins``, ``reused``, ``legs``
            and ``landpad``.

    When the record names a core ID, that core is fetched from the v4
    ``cores`` endpoint and its ``block``, ``reuse_count`` and ``serial``
    fields are appended to ``Block``, ``ReusedCount`` and ``Serial``;
    otherwise ``None`` is appended to those three lists. The remaining values
    come straight from the record itself.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    for core in data['cores']:
        core_id = core['core']
        if core_id is not None:
            reply = requests.get(
                "https://api.spacexdata.com/v4/cores/" + core_id,
                timeout=30,
            )
            reply.raise_for_status()
            details = reply.json()
            Block.append(details['block'])
            ReusedCount.append(details['reuse_count'])
            Serial.append(details['serial'])
        else:
            Block.append(None)
            ReusedCount.append(None)
            Serial.append(None)
        # Outcome is "<landing_success> <landing_type>", e.g. "True ASDS".
        Outcome.append(str(core['landing_success']) + ' ' + str(core['landing_type']))
        Flights.append(core['flight'])
        GridFins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandingPad.append(core['landpad'])

In [None]:
# Check whether the BoosterVersion list is empty or not.
BoosterVersion

In [None]:
# Display the prepared DataFrame that the helper functions read from.
df

In [None]:
# NOTE: every helper below appends to the global lists, so re-running one of
# these cells without first re-creating the lists adds duplicate entries.
# Apply the getBoosterVersion function to the DataFrame.
getBoosterVersion(df)

In [None]:
# len(BoosterVersion)

In [None]:
# Call getLaunchSite function (fills Longitude, Latitude and LaunchSite).
getLaunchSite(df)

In [None]:
# len(Latitude)

In [None]:
# Call getPayLoadData function (fills PayloadMass and Orbit).
getPayLoadData(df)

In [None]:
# len(PayloadMass)

In [None]:
# Call getCoreData function (fills the core- and landing-related lists).
getCoreData(df)

#### Create a New DATA FRAME with the help of extracted data.

In [None]:
# Column name -> column values for the new DataFrame. Flight number and date
# come from df; every other column is one of the global lists.
launch_dict = dict(
    FlightNumber=list(df['flight_number']),
    Date=list(df['date']),
    BoosterVersion=BoosterVersion,
    PayloadMass=PayloadMass,
    Orbit=Orbit,
    LaunchSite=LaunchSite,
    Outcome=Outcome,
    Flights=Flights,
    GridFins=GridFins,
    Reused=Reused,
    Legs=Legs,
    LandingPad=LandingPad,
    Block=Block,
    ReusedCount=ReusedCount,
    Serial=Serial,
    Longitude=Longitude,
    Latitude=Latitude,
)

In [None]:
# Print the length of each column; they must all match before the
# dictionary is turned into a DataFrame.
for column_name, values in launch_dict.items():
    print(column_name, len(values))

In [None]:
# Create a new DataFrame from launch_dict dictionary.
launch_df = pd.DataFrame(launch_dict)

In [None]:
# First 5 rows.
launch_df.head()

##### Filtering DataFrame to only include Falcon 9 launches.

In [None]:
# Drop the Falcon 1 launches. .copy() makes data_falcon9 an independent frame,
# so the in-place column updates in later cells do not trigger pandas'
# SettingWithCopyWarning.
data_falcon9 = launch_df[launch_df['BoosterVersion'] != 'Falcon 1'].copy()

In [None]:
# Some records were removed, so renumber the flights consecutively from 1.
renumbered_flights = list(range(1, len(data_falcon9) + 1))
data_falcon9.loc[:, 'FlightNumber'] = renumbered_flights
# data_falcon9

#### Data Wrangling

In [None]:
# Number of missing values in each column.
data_falcon9.isna().sum()

##### We need to handle the missing values in PayloadMass. For LandingPad, we keep the null values, since they represent launches where no landing pad was used.

In [None]:
''' Dealing with missing values in PayloadMass column. '''

# Replace the missing payload masses with the mean of the column
# (Series.mean() ignores missing values).
payload_mass = data_falcon9['PayloadMass']
payload_mean = payload_mass.mean()
data_falcon9['PayloadMass'] = payload_mass.fillna(payload_mean)

In [None]:
data_falcon9.head()

In [None]:
data_falcon9.to_csv('dataset_part_1.csv', index = False)