In [0]:
import xarray as xr
import numpy as np
import pandas as pd
import pickle
import os

##### Load the training data (update the path in the next cell if `000_train.nc` is not in the working directory)

In [0]:
# Path to the training file; change this if `000_train.nc` lives somewhere
# other than the notebook's working directory.
TRAIN_PATH = '000_train.nc'

# Open the NetCDF file with xarray and display the dataset summary.
ds = xr.open_dataset(TRAIN_PATH)
ds

In [0]:
def create_gan_dataset(ds, time_steps, output_path='data.pkl'):
  '''
  Build a wide table (one row per lat/lon/time, one column per variable) from
  the dataset `ds`.

  Only entries whose `var_names` is one of PRECT, QBP, TBP, PS, SOLIN, SHFLX
  or LHFLX are kept. For every requested time step, the last 19 rows of each
  (lat, lon, var_names) group are averaged, and the result is pivoted so that
  each variable becomes its own column.

  Parameters
  ----------
  ds : dataset-like
    Object exposing `var_names`, `time`, `where(cond, drop=True)` and
    `to_dataframe()` (in this notebook: the result of `xr.open_dataset`).
    The dataframe it produces must provide `lat`, `lon`, `var_names`, `vars`
    and `time`.
  time_steps : int
    Number of time steps to extract. Step `i` is selected with `time == i`
    for `i` in `range(time_steps)` (1 time step = 15 min).
  output_path : str or None, optional
    Where the resulting dataframe is pickled for re-use. Defaults to
    'data.pkl' in the current working directory; pass None to skip writing.

  Returns
  -------
  pandas.DataFrame
    Columns `lat`, `lon`, `time`, followed by one column per selected
    variable that is present in the data.
  '''
  wanted = ('PRECT', 'QBP', 'TBP', 'PS', 'SOLIN', 'SHFLX', 'LHFLX')
  group_keys = ['lat', 'lon', 'var_names']

  # Boolean mask that is True wherever var_names is one of the wanted names.
  keep = ds.var_names == wanted[0]
  for name in wanted[1:]:
    keep = keep | (ds.var_names == name)
  ds_prect = ds.where(keep, drop=True)

  # Iterate over each time step to get all columns for all locations at that
  # time (1 time step = 15 min). The per-step frames are collected in a list
  # and concatenated once, instead of growing a DataFrame inside the loop.
  # NOTE(review): assumes the time coordinate holds the step indices
  # 0, 1, 2, ... -- confirm against the file.
  step_frames = []
  for i in range(time_steps):
    step_df = ds_prect.where(ds_prect.time == i, drop=True).to_dataframe()
    # Only the last 19 rows of each group are kept. The original author's
    # rationale: the bottom 19 values of QBP and TBP have the most effect on
    # PRECT. Groups with fewer than 19 rows are kept whole.
    bottom_rows = step_df.groupby(group_keys).tail(19)
    # Then take the mean over those rows (i.e. over the 19 levels for QBP/TBP).
    step_frames.append(bottom_rows.groupby(group_keys).mean().reset_index())

  if step_frames:
    final_df = pd.concat(step_frames)
  else:
    # No time steps requested: fall back to an empty frame with the expected columns.
    final_df = pd.DataFrame(columns=['lat', 'lon', 'var_names', 'vars', 'time'])

  # Shift lon by -180 (stated intent: move the range from (0, 360) to (-180, 180)).
  # NOTE(review): this is a plain shift, not a wrap-around, so lon 0 becomes
  # -180 -- confirm this relabelling is what downstream code expects.
  final_df['lon'] = final_df['lon'] - 180

  # Convert the data to the required format such that each variable becomes a column.
  d_vars = final_df.pivot_table('vars', ['lat', 'lon', 'time'], 'var_names').reset_index()

  # Store the data in a pickle file for re-use; the context manager guarantees
  # the file handle is closed even if pickling fails.
  if output_path is not None:
    with open(output_path, 'wb') as fh:
      pickle.dump(d_vars, fh)
  return d_vars

#### Create the dataset

In [0]:
# Build the table for the first 200 time steps. With 1 time step = 15 min
# (per the comment inside create_gan_dataset) that is 50 hours of data.
# Side effect: the function also pickles the result to disk.
d_vars = create_gan_dataset(ds, 200)

In [0]:
# Preview the first rows of the pivoted table.
d_vars.head()

In [0]:
# Summary statistics of the table's columns, as a quick sanity check on value ranges.
d_vars.describe()

In [0]:
# (number of rows, number of columns) of the pivoted table.
d_vars.shape