# Consolidating ICARTT datasets into one dataframe for TOGA and one dataframe for WAS.

In [1]:
!pip install cartopy
import pandas as pd
import matplotlib.pyplot as plt
import cartopy as ccrs
import numpy as np
import xarray as xr
import os
import time
import datetime

Collecting cartopy
  Downloading Cartopy-0.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading Cartopy-0.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cartopy
Successfully installed cartopy-0.24.1


In [2]:
# All mission flight dates (YYYYMMDD strings), one list per ATom deployment.
# Later cells pair each list with the per-flight dataframes purely by position
# (dates1[i] <-> togas1[i] / was1[i]), so every list is kept in chronological
# order.
# NOTE(review): '20160801' and '20160803' were previously listed in swapped
# order in dates1, unlike every other list here; reordered chronologically --
# confirm against the flight files.
dates1 = ['20160729', '20160801', '20160803', '20160806', '20160808', '20160812',
          '20160815', '20160817', '20160820', '20160822', '20160823']
dates2 = ['20170126', '20170129', '20170201', '20170203', '20170205', '20170210',
          '20170213', '20170215', '20170218', '20170219', '20170221']
dates3 = ['20170928', '20171001', '20171004', '20171006', '20171008', '20171011',
          '20171014', '20171017', '20171020', '20171023', '20171025', '20171027']
dates4 = ['20180424', '20180427', '20180429', '20180501', '20180503', '20180506',
          '20180509', '20180512', '20180514', '20180517', '20180519', '20180521']

In [None]:
# Columns retained from every TOGA merge file; everything else is discarded.
# The order here is the column order of the resulting dataframes.
keep_toga = [
    # altitude, sample timing, and aircraft/position columns
    # (P/T/TAS/U/V/W are presumably pressure, temperature, airspeed and winds
    # -- TODO confirm against the merge-file header)
    'MSL_GPS_Altitude', 'Radar_Altitude',
    'UTC_Start', 'UTC_Stop_TOGA', 'UTC_Mean_1s',
    'P', 'T', 'TAS', 'U', 'V', 'W',
    'lat', 'lon',
    # TOGA species columns (all carry the _TOGA suffix)
    'CH2Cl2_TOGA', 'CHCl3_TOGA', 'C2Cl4_TOGA', 'ClBenzene_TOGA',
    'CHBrCl2_TOGA', 'CHBr2Cl_TOGA', 'CH2Br2_TOGA', 'CHBr3_TOGA',
    'CH2ClI_TOGA', 'CH2BrI_TOGA', 'CH3I_TOGA', 'CH2I2_TOGA',
    'iButane_TOGA', 'nButane_TOGA', 'iPentane_TOGA', 'nPentane_TOGA',
    'x2MePentane_TOGA', 'x3MePentane_TOGA', 'nHexane_TOGA',
    'x224TrimePentane_TOGA', 'nHeptane_TOGA', 'iButene1Butene_TOGA',
    'Isoprene_TOGA',
    'Benzene_TOGA', 'Toluene_TOGA', 'EthBenzene_TOGA',
    'mpXylene_TOGA', 'oXylene_TOGA',
    'aPinene_TOGA', 'Tricyclene_TOGA', 'Camphene_TOGA',
    'bPineneMyrcene_TOGA', 'LimoneneD3Carene_TOGA',
    'CH3CHO_TOGA', 'Propanal_TOGA', 'Butanal_TOGA', 'Acrolein_TOGA',
    'MAC_TOGA', 'Acetone_TOGA', 'MEK_TOGA', 'MVK_TOGA',
    'CH3OH_TOGA', 'MTBE_TOGA', 'CH3CN_TOGA', 'iPropONO2_TOGA',
]

In [3]:
def read_icartt(icartt_file: str, flt_num: int = None, meta: dict = None,
                instr_name_prefix: bool = False, add_file_no: bool = False):
    """Parse a single ICARTT file to a pandas dataframe.

    Parameters
    ----------
    icartt_file : str
        Path of the ICARTT file. The first comma-separated field of its
        first line is read as the number of header lines.
    flt_num : int, optional
        Flight/file number. Used as the index into ``meta['Instruments']``
        when ``instr_name_prefix`` is set, and as the value of the
        ``Flight_N`` column when ``add_file_no`` is set.
    meta : dict, optional
        Only read when ``instr_name_prefix`` is set; must then contain the
        key ``'Instruments'`` (a single name, or a sequence indexed by
        ``flt_num``).
    instr_name_prefix : bool
        If true, prefix every column name with ``<instrument>_`` so columns
        with the same name from different instruments stay distinguishable
        in a merged dataframe.
    add_file_no : bool
        If true, add an integer ``Flight_N`` column filled with ``flt_num``.

    Returns
    -------
    pandas.DataFrame
        The data table with the sentinel values -9, -99, -999, -9999 and
        -99999 replaced by NaN and whitespace stripped from column names.

    Raises
    ------
    ValueError
        If ``add_file_no`` is set but ``flt_num`` is None.
    """
    if meta is None:  # avoid sharing one mutable default dict between calls
        meta = {}
    if add_file_no and flt_num is None:
        raise ValueError("add_file_no=True requires flt_num to be given")

    # Get the header row number from the ICARTT; only the first line is needed.
    with open(icartt_file, "r") as f:
        header_row = int(f.readline().split(",")[0]) - 1

    # Parse the table starting where data begins (e.g. after the header).
    df = pd.read_csv(icartt_file, header=header_row, delimiter=",")

    # Set possible error values to NaNs.
    # NOTE(review): this is applied to every column, so a genuine reading of
    # exactly -9 or -99 would also become NaN -- confirm that is acceptable.
    df = df.replace([-9, -99, -999, -9999, -99999], np.nan)

    # Strip leading/trailing white space around variable names.
    df.columns = [c.strip() for c in df.columns]

    # Optionally tag each column with the instrument that produced it.
    if instr_name_prefix:
        instruments = meta['Instruments']
        # Indexed by flt # if more than 1 icartt file, a plain name otherwise.
        instrument = instruments[flt_num] if flt_num is not None else instruments
        df = df.add_prefix(instrument + '_')

    if add_file_no:
        # Column the same length as the data that contains the file #.
        # The builtin int replaces the np.int alias removed from NumPy.
        df['Flight_N'] = np.full(shape=len(df), fill_value=flt_num, dtype=int)

    return df  # dataframe with data

In [None]:
# Load every TOGA merge file into one list of per-flight dataframes per
# deployment, dropping columns that are entirely NaN.
toga_root = '/content/drive/MyDrive/Colab Notebooks/Atmos_Chem/ATom_MER-TOGA_Dataset.20210613_1'

togas1, togas2, togas3, togas4 = [], [], [], []

for deployment, flights in (('atom1', togas1), ('atom2', togas2),
                            ('atom3', togas3), ('atom4', togas4)):
    directory_path = os.path.join(toga_root, deployment)
    # os.listdir returns names in arbitrary order; later cells match these
    # frames to the dates lists by position, so read them in a deterministic
    # (sorted) order.
    # NOTE(review): assumes the file names sort chronologically (e.g. they
    # embed the flight date) -- confirm.
    for file in sorted(os.listdir(directory_path)):
        flight = read_icartt(os.path.join(directory_path, file))
        flights.append(flight.dropna(axis=1, how='all'))

In [None]:
# Keep only the columns listed in keep_toga, then drop any column that is
# entirely NaN for that flight.
for flights in (togas1, togas2, togas3, togas4):
  for i, frame in enumerate(flights):
    # The load step has already dropped all-NaN columns, so a wanted column
    # can be absent from an individual flight; selecting only the ones that
    # are present avoids a KeyError. (A misspelled name in keep_toga is
    # skipped silently as a consequence.)
    present = [col for col in keep_toga if col in frame.columns]
    flights[i] = frame[present].dropna(axis=1, how='all')

In [None]:
# Convert the TOGA time columns to Unix timestamps by adding the Unix time of
# 00:00 UTC on the flight date, and tag every row with its mission number.
# Frames are paired with dates by position. This step is NOT idempotent:
# running it twice adds the date offset twice.
toga_time_columns = ['UTC_Start', 'UTC_Stop_TOGA', 'UTC_Mean_1s']

for mission, (flights, dates) in enumerate(
    [(togas1, dates1), (togas2, dates2), (togas3, dates3), (togas4, dates4)],
    start=1,
):
  # A length mismatch would leave some flights unshifted or shift them by the
  # wrong date, so fail loudly instead.
  if len(flights) != len(dates):
    raise ValueError(
        'mission %d: %d TOGA files but %d flight dates'
        % (mission, len(flights), len(dates)))

  for i, date in enumerate(dates):
    # Midnight is taken in UTC explicitly; time.mktime would interpret it in
    # the machine's local time zone and skew the UTC_* columns.
    midnight = datetime.datetime.strptime(date, '%Y%m%d').replace(
        tzinfo=datetime.timezone.utc)
    unix = midnight.timestamp()

    shifted = {col: flights[i][col] + unix for col in toga_time_columns}
    # mission is stored as a float, as before.
    flights[i] = flights[i].assign(mission=float(mission), **shifted)

In [None]:
# Consolidate all TOGA flights (missions 1-4, in order) into one dataframe.
# A single concat over the full list avoids re-copying the growing frame on
# every iteration. Row labels are left untouched, as before.
togas = pd.concat(togas1 + togas2 + togas3 + togas4)

In [None]:
# Write the consolidated TOGA dataframe to a CSV file in the working directory.
togas.to_csv(path_or_buf='togas2.csv')

## Start of WAS data consolidation

In [9]:
keep_was = [
'UTC_Start',
'UTC_Stop_WAS',
'UTC_Mean_1s',
'MSL_GPS_Altitude',
'P',
'T',
'TAS',
'U',
'V',
'W',
'lat',
'lon',
'OCS_WAS',
'DMS_WAS',
'CFC12_WAS',
'CFC11_WAS',
'CFC113_WAS',
'CFC114_WAS',
'HFC134a_WAS',
'HCFC22_WAS',
'HCFC142b_WAS',
'HCFC141b_WAS',
'H1301_WAS',
'H2402_WAS',
'H1211_WAS',
'CH3CCl3_WAS',
'CCl4_WAS',
'CHCl3_WAS',
'CH2Cl2_WAS',
'C2Cl4_WAS',
'CH3Cl_WAS',
'CH3Br_WAS',
'CH3I_WAS',
'CH2Br2_WAS',
'CHBrCl2_WAS',
'CHBr2Cl_WAS',
'CHBr3_WAS',
'CH2ClCH2Cl_WAS',
'MeONO2_WAS',
'EthONO2_WAS',
'iPropONO2_WAS',
'nPropONO2_WAS',
'x2ButONO2_WAS',
'x3PentONO2_WAS',
'x2PentONO2_WAS',
'x3Me2ButONO2_WAS',
'Ethane_WAS',
'Ethene_WAS',
'Ethyne_WAS',
'Propane_WAS',
'Propene_WAS',
'iButane_WAS',
'nButane_WAS',
'iPentane_WAS',
'nPentane_WAS',
'Isoprene_WAS',
'nHexane_WAS',
'nHeptane_WAS',
'x2MePentane_WAS',
'x3MePentane_WAS',
'Benzene_WAS',
'Toluene_WAS',
'EthBenzene_WAS',
'mpXylene_WAS',
'oXylene_WAS',
'O3_CL',
'O3_CL_2sigma',
'O3_GMI',
'OH_GMI',
'HCFC22_GMI',
'CFC11_GMI'
]

In [4]:
# Bring in the WAS data: one list of per-flight dataframes per deployment,
# with columns that are entirely NaN dropped.
was_root = '/content/drive/MyDrive/Colab Notebooks/Atmos_Chem/ATom_MER-WAS_Dataset.20210613_1'

was1, was2, was3, was4 = [], [], [], []

for deployment, flights in (('atom1', was1), ('atom2', was2),
                            ('atom3', was3), ('atom4', was4)):
    directory_path = os.path.join(was_root, deployment)
    # os.listdir returns names in arbitrary order; later cells match these
    # frames to the dates lists by position, so read them in a deterministic
    # (sorted) order.
    # NOTE(review): assumes the file names sort chronologically (e.g. they
    # embed the flight date) -- confirm.
    for file in sorted(os.listdir(directory_path)):
        flight = read_icartt(os.path.join(directory_path, file))
        flights.append(flight.dropna(axis=1, how='all'))

In [10]:
# Keep only the columns listed in keep_was, then drop any column that is
# entirely NaN for that flight.
for flights in (was1, was2, was3, was4):
  for i, frame in enumerate(flights):
    # The load step has already dropped all-NaN columns, so a wanted column
    # can be absent from an individual flight; selecting only the ones that
    # are present avoids a KeyError. (A misspelled name in keep_was is
    # skipped silently as a consequence.)
    present = [col for col in keep_was if col in frame.columns]
    flights[i] = frame[present].dropna(axis=1, how='all')

In [11]:
# Convert the WAS time columns to Unix timestamps by adding the Unix time of
# 00:00 UTC on the flight date, and tag every row with its mission number.
# Frames are paired with dates by position. This step is NOT idempotent:
# running it twice adds the date offset twice.
was_time_columns = ['UTC_Start', 'UTC_Stop_WAS', 'UTC_Mean_1s']

for mission, (flights, dates) in enumerate(
    [(was1, dates1), (was2, dates2), (was3, dates3), (was4, dates4)],
    start=1,
):
  # A length mismatch would leave some flights unshifted or shift them by the
  # wrong date, so fail loudly instead.
  if len(flights) != len(dates):
    raise ValueError(
        'mission %d: %d WAS files but %d flight dates'
        % (mission, len(flights), len(dates)))

  for i, date in enumerate(dates):
    # Midnight is taken in UTC explicitly; time.mktime would interpret it in
    # the machine's local time zone and skew the UTC_* columns.
    midnight = datetime.datetime.strptime(date, '%Y%m%d').replace(
        tzinfo=datetime.timezone.utc)
    unix = midnight.timestamp()

    # All three time columns get the same date offset. Previously UTC_Mean_1s
    # for missions 2-4 was shifted by 1 second instead of by the offset.
    shifted = {col: flights[i][col] + unix for col in was_time_columns}
    # mission is stored as a float, as before.
    flights[i] = flights[i].assign(mission=float(mission), **shifted)

In [12]:
# Consolidate all WAS flights (missions 1-4, in order) into one dataframe.
# A single concat over the full list avoids re-copying the growing frame on
# every iteration. Row labels are left untouched, as before.
was = pd.concat(was1 + was2 + was3 + was4)

In [13]:
# Write the consolidated WAS dataframe to a CSV file in the working directory.
was.to_csv(path_or_buf='was3.csv')