In [1]:
from netCDF4 import Dataset
import os

from netCDFfunc.utility import *
import pandas as pd

In [2]:
base_dir = '/Volumes/T7/download_data'
for data_name in os.listdir(base_dir) :
    data_folder = os.path.join(base_dir, data_name)
    data_file = os.path.join(data_folder, os.listdir(data_folder)[-2])
    print(os.path.getsize(data_file) // 1024)
    ds = Dataset(data_file, 'r', format='NETCDF4')
    
    ds.close()

897
6875
150278
1013
1834
732974
6695
848
38017
18606
18237
151921


In [3]:
base_dir = '/Volumes/T7/AVHRR_OI_SST/v2.1'

In [4]:
value_1 = get_data_by_date(base_dir = base_dir,
                           get_data_func = get_data_A,
                           var_name = 'sst',
                           start_date = (1981, 9, 1),
                           end_date = (2011,8,31),
                           specific_date = (1,1))

  0%|          | 0/31 [00:00<?, ?it/s]

In [5]:
data = value_1[(1,1)][0]

In [64]:
def cropping(arr, region, grid_size):
    ratio = 0.25 / grid_size
    
    if region == 'rok':
        return arr[int(440 * ratio):int(572 * ratio), int(440 * ratio):int(600 * ratio)]
    if region == 'nw':
        return arr[int(280 * ratio):int(624 * ratio), int(392 * ratio):int(1136 * ratio)]
    if region == 'global':
        return arr

In [31]:
def nc_write(ds_new, title, comment, grid_size, 
                    core_variable_name, core_variable_standard_name, core_variable_unit, core_variable_dtype, core_variable_values,
                    lat_range=(0,None), lon_range=(0,None)):
    
    # set attribute
    now = dt.datetime.now()
    attr_dict = {'title' : title,
                 'grid' : f'{grid_size}',
                 'institution' : 'NIA',
                 'name_creator' : 'BNT',
                 'date_creation' : now.strftime('%Y-%m-%d %H:%M:%S'),
                 'comment' : comment}

    for k, v in attr_dict.items():
        ds_new.setncattr(k,v)

    lat_s, lat_e = lat_range  # 1358, 1852
    lon_s, lon_e = lon_range
    
    ratio = 0.25/grid_size 
    lat_s, lat_e = int(lat_s*ratio), int(lat_e*ratio)
    lon_s, lon_e = int(lon_s*ratio), int(lon_e*ratio)
    
    lat_force_cut = None
    lon_force_cut = None
    
    print(core_variable_values.shape)
    if grid_size == 0.081 : 
        lat_force_cut = -1
        lon_force_cut = -1
    if grid_size == 0.054:
        lat_force_cut = -1
    if grid_size == 0.08789 :
        lat_force_cut = -1
        lon_force_cut = -1
        
       
    lat_grid = np.arange(-90 + (grid_size/2), 90 + (grid_size/2), grid_size)[:lat_force_cut][lat_s:lat_e]
    lon_grid = np.arange(0 + (grid_size/2), 360 + (grid_size/2), grid_size)[:lon_force_cut][lon_s:lon_e]
    print(len(lat_grid))
    print(len(lon_grid))
    
    # set dimension
    dim_dict = {'ntime' : 1,
                'nlat' : len(lat_grid),
                'nlon' : len(lon_grid)}
    
    for k, v in dim_dict.items():
        ds_new.createDimension(k,v)

    # set variables
    for variable_name in ['time', 'lat', 'lon', core_variable_name]:

        if variable_name == 'time' :
            variable_attribute = {'standard_name' : 'time',
                                  'format' : 'Mdd',
                                  'axis' : 'T'}
            dtype = np.int16
            dimensions = ('ntime',)
            variable_values = 101

        if variable_name == 'lat' :
            variable_attribute = {'standard_name' : 'latitude',
                                  'units' : 'degrees',
                                  'axis' : 'Y'}
            dtype = np.float32
            dimensions = ('nlat',)
            variable_values = lat_grid

        if variable_name == 'lon' :
            variable_attribute = {'standard_name' : 'longitude',
                                  'units' : 'degrees',
                                  'axis' : 'X'}
            dtype = np.float32
            dimensions = ('nlon',)
            variable_values = lon_grid
            
        if variable_name == core_variable_name :
            variable_attribute  = {'standard_name' : core_variable_standard_name,
                                   'units' : core_variable_unit}
            dtype = core_variable_dtype
            dimensions = ('ntime', 'nlat', 'nlon',)
            variable_values = core_variable_values#[lat_s:lat_e, lon_s:lon_e]


        fill_value = -999

        ds_new = create_new_variable(ds_new,
                                     new_variable_name=variable_name,  
                                     dtype=dtype,
                                     dimension=dimensions,
                                     fill_value=fill_value,
                                     values=variable_values,
                                     attributes=variable_attribute)

    return ds_new

In [8]:
base_dir = '/Volumes/T7/new_data/file_size_test/'
file_name = 'global'

grid_size = 0.10
nc_path = os.path.join(base_dir, 'nc' ,file_name+f'_{grid_size}.nc')

ds_new = Dataset(nc_path, 'w', format='NETCDF4')
title = 'Global 30 years(1981~2011) SST average data'
comment = 'calculation 1981/9/1 ~ 2011/8/31'

data = np.mean(value_1[(1,1)], axis=0)
ratio = 0.25 / grid_size
data = ndimage.zoom(data, ratio, order=0) # nearest interpolation

variable_name = 'avgsst'
variable_standard_name = 'averageSST'
variable_unit = 'degree C'
variable_dtype = np.float32
variable_values = data

ds_new = nc_write(ds_new, title, comment, grid_size,
                                 variable_name, 
                                 variable_standard_name, 
                                 variable_unit, 
                                 variable_dtype, 
                                 variable_values)

ds_new.close()

In [7]:
data.shape

(2048, 4096)

In [38]:
ds_new.close()

In [105]:
base_dir = '/Volumes/T7/new_data/file_size_test/'
file_name = 'nwp'

grid_size = 0.25
nc_path = os.path.join(base_dir, 'nc' ,file_name+f'_{grid_size}.nc')

ds_new = Dataset(nc_path, 'w', format='NETCDF4')
title = 'North-West Pacific 30 years(1981~2011) SST average data'
comment = 'calculation 1981/9/1 ~ 2011/8/31'


lat_range = (280, 624)
lon_range = (392, 1136)

data = np.mean(value_1[(1,1)], axis=0)
ratio = 0.25 / grid_size
data = ndimage.zoom(data, ratio, order=0) # nearest interpolation
data = cropping(data, 'nw', grid_size)

variable_name = 'avgsst'
variable_standard_name = 'averageSST'
variable_unit = 'degree C'
variable_dtype = np.float32
variable_values = data

ds_new = nc_write(ds_new, title, comment, grid_size,
                         variable_name, 
                         variable_standard_name, 
                         variable_unit, 
                         variable_dtype, 
                         variable_values,
                         lat_range,
                         lon_range)

ds_new.close()

(344, 744)
344
744


In [32]:
ds_new.close()

In [48]:
0.25/0.054 * 160

740.7407407407408

In [63]:
(600-440) * (0.25/0.054)

740.7407407407408

In [None]:
132, 160

In [59]:
data.shape

(611, 741)

In [77]:
base_dir = '/Volumes/T7/new_data/file_size_test/'
file_name = 'rok'

grid_size = 0.08789
nc_path = os.path.join(base_dir, 'nc' ,file_name+f'_{grid_size}.nc')

ds_new = Dataset(nc_path, 'w', format='NETCDF4')
title = 'Republic Of Korea Vicinity 30 years(1981~2011) SST average data'
comment = 'calculation 1981/9/1 ~ 2011/8/31'

lat_range = (440, 572)
lon_range = (440, 600)

data = np.mean(value_1[(1,1)], axis=0)
ratio = 0.25 / grid_size
data = ndimage.zoom(data, ratio, order=0) # nearest interpolation
data = cropping(data, 'rok', grid_size)

variable_name = 'avgsst'
variable_standard_name = 'averageSST'
variable_unit = 'degree C'
variable_dtype = np.float32
variable_values = data

ds_new = nc_write(ds_new, title, comment, grid_size,
                         variable_name, 
                         variable_standard_name, 
                         variable_unit, 
                         variable_dtype, 
                         variable_values,
                         lat_range,
                         lon_range)

ds_new.close()

(376, 455)
376
455


In [6]:
base_dir = '/Volumes/T7/new_data/file_size_test/nc'
dic = dict()

for data_name in tqdm(os.listdir(base_dir)) :
    data_file = os.path.join(base_dir, data_name)
    region, grid = data_name.replace('.nc','').split('_')
    
    print(grid)
    if grid != '0.01':
        continue
    
    if dic.get(region) == None :
        dic[region] = dict()
    dic[region][grid] = round(os.path.getsize(data_file) / 1024,1)
    
    ds = Dataset(data_file, 'r', format='NETCDF4')
    
    data = ds['avgsst'][:].data
    lat_range = ds['lat'][:]
    lon_range = ds['lon'][:]
    
    s_index = 0
    e_index = len(lat_range) // 3
    
    for n in range(3):
        d_stack = []

        if e_index >= len(lat_range) :
            e_index = len(lat_range)
        for i in tqdm(range(s_index, e_index)):
            for j in range(len(lon_range)):
#                 if data[0][i][j] != -999:
                d_stack.append((lat_range[i], lon_range[j], data[0][i][j]))
                
        s_index = e_index
        e_index += len(lat_range) // 3
    
        df = pd.DataFrame(d_stack, columns=['lat', 'lon', 'sst'])
        df.to_csv(f'/Volumes/T7/new_data/file_size_test/csv/{region}_{grid}{n}.csv', index=False)
        
    ds.close()

  0%|          | 0/22 [00:00<?, ?it/s]

0.25
0.25
0.25
0.01


  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

0.05
0.054
0.081
0.08789
0.1
0.01


  0%|          | 0/2866 [00:00<?, ?it/s]

  0%|          | 0/2866 [00:00<?, ?it/s]

  0%|          | 0/2866 [00:00<?, ?it/s]

0.05
0.1
0.081
0.054
0.08789
0.01


  0%|          | 0/1100 [00:00<?, ?it/s]

  0%|          | 0/1100 [00:00<?, ?it/s]

  0%|          | 0/1100 [00:00<?, ?it/s]

0.05
0.1
0.054
0.081
0.08789
file.zip


In [108]:
pd.DataFrame(dic)

Unnamed: 0,global,nwp,rok
0.25,4069.6,1013.8,93.7
0.01,2531472.1,624961.2,51602.2
0.05,101303.3,25026.2,2079.4
0.054,86851.4,21454.4,1782.6
0.081,38609.7,9544.3,797.0
0.08789,32803.1,8107.0,681.4
0.1,25344.7,6270.2,528.8


In [175]:
data_name

'._global_0.25.csv'

In [5]:
base_dir = '/Volumes/T7/new_data/file_size_test/csv'
dic = dict()

for data_name in tqdm(os.listdir(base_dir)) :
    d_stack = []
    data_file = os.path.join(base_dir, data_name)
    region, grid = data_name.replace('.csv','').split('_')
    
    print(grid)
    if grid == '0.01':
        continue
    
    if dic.get(region) == None :
        dic[region] = dict()
    dic[region][grid] = os.path.getsize(data_file) / 1024 / 1024

  0%|          | 0/20 [00:00<?, ?it/s]

0.25
0.25
0.25
file.zip
0.05
0.054
0.081
0.08789
0.1
0.05
0.1
0.081
0.054
0.08789
0.05
0.1
0.054
0.081
0.08789


ValueError: too many values to unpack (expected 2)

In [14]:
(86722094 + 65874973 + 28803700) / 1024 / 1024

172.99725246429443

In [7]:
base_dir = '/Volumes/T7/new_data/file_size_test/csv'
dic = dict()

for data_name in tqdm(os.listdir(base_dir)) :
    d_stack = []
    data_file = os.path.join(base_dir, data_name)
    region, grid = data_name.replace('.csv','').split('_')
    
    print(grid)
    if grid == '0.01':
        continue
    
    if dic.get(region) == None :
        dic[region] = dict()
    dic[region][grid] = os.path.getsize(data_file) / 1024 / 1024

  0%|          | 0/29 [00:00<?, ?it/s]

0.25
0.25
0.25
0.010
file.zip
0.05
0.054
0.081
0.08789
0.1
0.05
0.1
0.081
0.054
0.08789
0.05
0.1
0.054
0.081
0.08789


ValueError: too many values to unpack (expected 2)

In [189]:
pd.DataFrame(dic)

Unnamed: 0,nwp,global,rok
0.25,4.371607,16.446783,0.275848
0.05,109.386656,411.234356,11.621775
0.1,25.117818,94.587078,1.587106
0.081,45.068994,169.212512,2.838483
0.054,93.79917,352.59125,5.911266
0.08789,42.410763,158.8857,2.687705


In [126]:
pd.DataFrame(d_stack, columns=['lat', 'lon', 'sst'])

Unnamed: 0,lat,lon,sst
0,-78.375,165.875,-1.502000
1,-78.375,166.125,-1.494667
2,-78.375,166.375,-1.480000
3,-78.375,166.625,-1.458000
4,-78.375,166.875,-1.426667
...,...,...,...
691145,89.875,358.875,-1.730333
691146,89.875,359.125,-1.730333
691147,89.875,359.375,-1.730333
691148,89.875,359.625,-1.730333


In [135]:
ds['avgsst'][:].shape

(1, 3333, 6667)