# Creating and Packaging GeoDataFrame from Data

## Import Dependencies

In [1]:
import pandas as pd 
import re
from pathlib import Path
import shapely
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from glob import glob
tqdm.pandas();

## Define Directory Paths
### Input Directories

In [2]:
# The three input directories, one per dataset split, as pathlib.Path objects.
train_dir, test_dir, sample_dir = map(
    Path,
    ('/home/data/train/', '/home/data/test_public/', '/home/data/sample/'),
)

### Output Paths

In [3]:
# Outputs are written under <parent of the current working directory>/data.
output_path = Path.cwd().parent.joinpath('data/')
output_csv_path = output_path.joinpath('output_csvs/')
# Create the CSV directory together with any missing parents; an already
# existing directory is left as is.
output_csv_path.mkdir(parents=True, exist_ok=True)

In [4]:
""" def extract_metadata_from_string(string):
    # extracted groups
    # full path - image_dir_name - sub_dir_name - fname - year - month - data_type - extension
    string_mas='../'+string
    pattern = r'/(t.+|sample)/(L.+)/(\w+)/(.+_(\d+)_(\d+)_m.+_\d+_\d+_\d+)(?:_(\w+))?.(\w+)'
    filter= string_mas.find('DS_Store') 
    if filter >= 0:
        return 'DS'
    else:
        match = re.findall(pattern=pattern,string=string_mas)
        return match[0] """
    
def extract_metadata_from_string(string):
    """Parse the dataset metadata encoded in a file path.

    Parameters
    ----------
    string : str
        A path of the form
        ``.../<parent_dir>/<image_dir_name>/<sub_dir_name>/<fname>[_<data_type>].<extension>``
        where ``parent_dir`` is ``sample`` or starts with ``t`` and
        ``image_dir_name`` starts with ``L``.

    Returns
    -------
    tuple of str
        ``(parent_dir, image_dir_name, sub_dir_name, fname, year, month,
        data_type, extension)``. ``data_type`` is the empty string when the
        file name has no ``_<data_type>`` suffix.

    Raises
    ------
    ValueError
        If ``string`` does not follow the expected layout (for example a
        stray ``.DS_Store`` file or a name without an extension).
    """
    # extracted groups
    # parent_dir - image_dir_name - sub_dir_name - fname - year - month - data_type - extension
    # The dot in front of the extension is escaped so that it only matches a
    # literal '.'. Unescaped, a name without an extension would still "match"
    # by giving up the last characters of the data type.
    pattern = r'/(t.+|sample)/(L.+)/(\w+)/(.+_(\d+)_(\d+)_m.+_\d+_\d+_\d+)(?:_(\w+))?\.(\w+)'
    match = re.search(pattern, string)
    if match is None:
        raise ValueError(
            f'path does not match the expected dataset layout: {string!r}'
        )
    # default='' reports a missing optional data_type group as an empty
    # string rather than None, exactly as re.findall does.
    return match.groups(default='')

In [5]:
# Hand-written example paths, one per kind of file name:
#   string1 - a UDM mask (.tif with a `_UDM` suffix)
#   string2 - a plain image (.tif without a suffix)
#   stst    - a building label file (.geojson with a `_Buildings` suffix)
# NOTE(review): string2 starts with '/homedata//sample/' while the other two
# start with '/home/data/...' - looks like a typo; confirm whether it is intended.
string1 ='/home/data/sample/L15-0506E-1204N_2027_3374_13/UDM_masks/global_monthly_2019_11_mosaic_L15-0506E-1204N_2027_3374_13_UDM.tif'
string2 = '/homedata//sample/L15-0506E-1204N_2027_3374_13/images/global_monthly_2019_01_mosaic_L15-0506E-1204N_2027_3374_13.tif'
stst='/home/data/train/L15-0760E-0887N_3041_4643_13/labels_match_pix/global_monthly_2019_08_mosaic_L15-0760E-0887N_3041_4643_13_Buildings.geojson'

In [6]:
extract_metadata_from_string(stst)

('train',
 'L15-0760E-0887N_3041_4643_13',
 'labels_match_pix',
 'global_monthly_2019_08_mosaic_L15-0760E-0887N_3041_4643_13',
 '2019',
 '08',
 'Buildings',
 'geojson')

In [7]:
def extract_list_of_paths(directory):
    """Recursively list the files under ``directory`` whose name contains a dot.

    Parameters
    ----------
    directory : str or pathlib.Path
        Root directory to search.

    Returns
    -------
    list of pathlib.Path
        Matching files in sorted order. Directories are never returned, even
        when their own name contains a dot.
    """
    root = Path(directory)
    # glob() yields entries in an arbitrary, filesystem-dependent order;
    # sorting keeps everything built from this list reproducible across runs.
    return sorted(path for path in root.glob('**/*.*') if path.is_file())

In [8]:
""" def extract_metadata_from_list_of_paths(list_of_paths):
    d_keys = ['parent_dir','image_dir_name','sub_dir_name','fname','year','month','data_type','extension']
    d = {key:[] for key in d_keys}
    d['full_path'] = []
    for path in list_of_paths:
        print(path)
        print(str(path))
        metadata = extract_metadata_from_string(str(path))
        if metadata == 'DS':
            continue
        else:
            d['full_path'].append(path)
            
            for i,data in enumerate(metadata):
                d[d_keys[i]].append(data)
        return d """

def extract_metadata_from_list_of_paths(list_of_paths):
    """Collect the metadata of many paths into a column-oriented dict.

    Each path is converted to ``str`` and parsed with
    ``extract_metadata_from_string``; the parsed fields are appended, in
    order, to the lists stored under the corresponding field names.

    Parameters
    ----------
    list_of_paths : iterable
        Paths to parse.

    Returns
    -------
    dict
        Maps each field name, plus ``'full_path'`` (the original path
        object), to a list with one entry per input path.
    """
    field_names = [
        'parent_dir',
        'image_dir_name',
        'sub_dir_name',
        'fname',
        'year',
        'month',
        'data_type',
        'extension',
    ]
    columns = {name: [] for name in field_names}
    columns['full_path'] = []

    for current_path in list_of_paths:
        fields = extract_metadata_from_string(str(current_path))
        columns['full_path'].append(current_path)
        # The n-th parsed field belongs to the n-th field name.
        for position, value in enumerate(fields):
            columns[field_names[position]].append(value)

    return columns

#### Extracting the Metadata

In [9]:
# One list of file paths per input directory (train, test, sample).
train_paths, test_paths, sample_paths = (
    extract_list_of_paths(directory=split_dir)
    for split_dir in (train_dir, test_dir, sample_dir)
)

In [11]:
# Parse every path list into a column-oriented metadata dict.
train_metadata_dict, test_metadata_dict, sample_metadata_dict = map(
    extract_metadata_from_list_of_paths,
    (train_paths, test_paths, sample_paths),
)

In [12]:
# Turn each metadata dict into a DataFrame (one row per file).
df_train, df_test, df_sample = map(
    pd.DataFrame,
    (train_metadata_dict, test_metadata_dict, sample_metadata_dict),
)

In [13]:
df_train

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_01_mosaic_L15-0331E-1257N_...,2018,01,,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
1,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_02_mosaic_L15-0331E-1257N_...,2018,02,,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
2,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_03_mosaic_L15-0331E-1257N_...,2018,03,,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
3,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_04_mosaic_L15-0331E-1257N_...,2018,04,,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
4,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_05_mosaic_L15-0331E-1257N_...,2018,05,,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
...,...,...,...,...,...,...,...,...,...
8734,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_09_mosaic_L15-1848E-0793N_...,2019,09,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8735,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_10_mosaic_L15-1848E-0793N_...,2019,10,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8736,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_11_mosaic_L15-1848E-0793N_...,2019,11,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8737,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_12_mosaic_L15-1848E-0793N_...,2019,12,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...


In [14]:
df_test

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_02_mosaic_L15-0369E-1244N_...,2018,02,,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
1,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_03_mosaic_L15-0369E-1244N_...,2018,03,,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
2,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_04_mosaic_L15-0369E-1244N_...,2018,04,,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
3,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_05_mosaic_L15-0369E-1244N_...,2018,05,,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
4,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_06_mosaic_L15-0369E-1244N_...,2018,06,,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
...,...,...,...,...,...,...,...,...,...
461,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_06_mosaic_L15-1749E-1266N_...,2019,06,,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
462,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_07_mosaic_L15-1749E-1266N_...,2019,07,,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
463,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_08_mosaic_L15-1749E-1266N_...,2019,08,,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
464,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_09_mosaic_L15-1749E-1266N_...,2019,09,,tif,/home/data/test_public/L15-1749E-1266N_6997_31...


In [15]:
df_sample

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,sample,L15-0506E-1204N_2027_3374_13,UDM_masks,global_monthly_2018_06_mosaic_L15-0506E-1204N_...,2018,06,UDM,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
1,sample,L15-0506E-1204N_2027_3374_13,UDM_masks,global_monthly_2019_11_mosaic_L15-0506E-1204N_...,2019,11,UDM,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
2,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_01_mosaic_L15-0506E-1204N_...,2018,01,,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
3,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_02_mosaic_L15-0506E-1204N_...,2018,02,,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
4,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_03_mosaic_L15-0506E-1204N_...,2018,03,,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
...,...,...,...,...,...,...,...,...,...
141,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_08_mosaic_L15-0506E-1204N_...,2019,08,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
142,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_09_mosaic_L15-0506E-1204N_...,2019,09,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
143,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_10_mosaic_L15-0506E-1204N_...,2019,10,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
144,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_11_mosaic_L15-0506E-1204N_...,2019,11,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...


In [16]:
df_train[df_train['data_type'] == 'Buildings']['extension'].value_counts()

geojson    4269
Name: extension, dtype: int64

In [17]:
df_train[df_train['data_type'] == 'UDM']['extension'].value_counts()

geojson    1423
tif         201
Name: extension, dtype: int64

In [18]:
df_train[df_train['data_type'] == '']['extension'].value_counts()

tif    2846
Name: extension, dtype: int64

In [19]:
# Rows whose data_type was parsed as an empty string are relabelled 'Images'
# (in place, on all three frames).
for frame in (df_train, df_test, df_sample):
    unlabelled = frame['data_type'] == ''
    frame.loc[unlabelled, 'data_type'] = 'Images'

In [20]:
df_train

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_01_mosaic_L15-0331E-1257N_...,2018,01,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
1,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_02_mosaic_L15-0331E-1257N_...,2018,02,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
2,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_03_mosaic_L15-0331E-1257N_...,2018,03,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
3,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_04_mosaic_L15-0331E-1257N_...,2018,04,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
4,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_05_mosaic_L15-0331E-1257N_...,2018,05,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...
...,...,...,...,...,...,...,...,...,...
8734,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_09_mosaic_L15-1848E-0793N_...,2019,09,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8735,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_10_mosaic_L15-1848E-0793N_...,2019,10,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8736,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_11_mosaic_L15-1848E-0793N_...,2019,11,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...
8737,train,L15-1848E-0793N_7394_5018_13,labels_match_pix,global_monthly_2019_12_mosaic_L15-1848E-0793N_...,2019,12,Buildings,geojson,/home/data/train/L15-1848E-0793N_7394_5018_13/...


In [21]:
df_test

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_02_mosaic_L15-0369E-1244N_...,2018,02,Images,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
1,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_03_mosaic_L15-0369E-1244N_...,2018,03,Images,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
2,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_04_mosaic_L15-0369E-1244N_...,2018,04,Images,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
3,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_05_mosaic_L15-0369E-1244N_...,2018,05,Images,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
4,test_public,L15-0369E-1244N_1479_3214_13,images_masked,global_monthly_2018_06_mosaic_L15-0369E-1244N_...,2018,06,Images,tif,/home/data/test_public/L15-0369E-1244N_1479_32...
...,...,...,...,...,...,...,...,...,...
461,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_06_mosaic_L15-1749E-1266N_...,2019,06,Images,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
462,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_07_mosaic_L15-1749E-1266N_...,2019,07,Images,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
463,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_08_mosaic_L15-1749E-1266N_...,2019,08,Images,tif,/home/data/test_public/L15-1749E-1266N_6997_31...
464,test_public,L15-1749E-1266N_6997_3126_13,images_masked,global_monthly_2019_09_mosaic_L15-1749E-1266N_...,2019,09,Images,tif,/home/data/test_public/L15-1749E-1266N_6997_31...


In [22]:
df_sample

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path
0,sample,L15-0506E-1204N_2027_3374_13,UDM_masks,global_monthly_2018_06_mosaic_L15-0506E-1204N_...,2018,06,UDM,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
1,sample,L15-0506E-1204N_2027_3374_13,UDM_masks,global_monthly_2019_11_mosaic_L15-0506E-1204N_...,2019,11,UDM,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
2,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_01_mosaic_L15-0506E-1204N_...,2018,01,Images,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
3,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_02_mosaic_L15-0506E-1204N_...,2018,02,Images,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
4,sample,L15-0506E-1204N_2027_3374_13,images,global_monthly_2018_03_mosaic_L15-0506E-1204N_...,2018,03,Images,tif,/home/data/sample/L15-0506E-1204N_2027_3374_13...
...,...,...,...,...,...,...,...,...,...
141,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_08_mosaic_L15-0506E-1204N_...,2019,08,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
142,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_09_mosaic_L15-0506E-1204N_...,2019,09,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
143,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_10_mosaic_L15-0506E-1204N_...,2019,10,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...
144,sample,L15-0506E-1204N_2027_3374_13,labels_match_pix,global_monthly_2019_11_mosaic_L15-0506E-1204N_...,2019,11,Buildings,geojson,/home/data/sample/L15-0506E-1204N_2027_3374_13...


In [23]:
def get_metadata(input_dir):
    """Build the per-file metadata table for one input directory.

    The directory is scanned with ``extract_list_of_paths`` and every path is
    parsed with ``extract_metadata_from_list_of_paths``. On top of the parsed
    columns:

    * an empty ``data_type`` is replaced by ``'Images'``;
    * a boolean ``has_udm`` column is added, True for every row whose
      ``fname`` also appears in a row of the ``UDM_masks`` sub-directory.

    Parameters
    ----------
    input_dir : pathlib.Path
        Directory to scan.

    Returns
    -------
    pandas.DataFrame
        One row per file, with the parsed columns, ``full_path`` and
        ``has_udm``.
    """
    list_of_paths = extract_list_of_paths(input_dir)
    metadata_dict = extract_metadata_from_list_of_paths(list_of_paths)
    df = pd.DataFrame(metadata_dict)

    df.loc[df['data_type'] == '', 'data_type'] = 'Images'

    # File names that have a UDM mask.
    udm_fnames = df.loc[df['sub_dir_name'] == 'UDM_masks', 'fname']
    # Vectorized membership test: replaces a per-row Python `x in list` lookup,
    # so it scales linearly and does not need tqdm's pandas integration.
    df['has_udm'] = df['fname'].isin(udm_fnames)

    return df

#### Saving the Outputs

In [24]:
df_train = get_metadata(train_dir)
df_test = get_metadata(test_dir)
df_sample = get_metadata(sample_dir)
# ignore_index=True renumbers the combined rows 0..n-1. A bare reset_index()
# would instead keep the old per-frame index as an extra 'index' column,
# which would then be written to the CSV as if it were data.
df_concat = pd.concat([df_train, df_test, df_sample], ignore_index=True)

  0%|          | 0/8739 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

In [25]:
# Write each metadata table to the output directory, without the row index.
for csv_name, frame in (
    ('df_train.csv', df_train),
    ('df_test.csv', df_test),
    ('df_sample.csv', df_sample),
    ('df_concat.csv', df_concat),
):
    frame.to_csv(output_csv_path/csv_name, index=False)

In [26]:
df_train.head()

Unnamed: 0,parent_dir,image_dir_name,sub_dir_name,fname,year,month,data_type,extension,full_path,has_udm
0,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_01_mosaic_L15-0331E-1257N_...,2018,1,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...,False
1,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_02_mosaic_L15-0331E-1257N_...,2018,2,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...,False
2,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_03_mosaic_L15-0331E-1257N_...,2018,3,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...,False
3,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_04_mosaic_L15-0331E-1257N_...,2018,4,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...,False
4,train,L15-0331E-1257N_1327_3160_13,images,global_monthly_2018_05_mosaic_L15-0331E-1257N_...,2018,5,Images,tif,/home/data/train/L15-0331E-1257N_1327_3160_13/...,False


In [27]:
!ls ../data/output_csvs

df_concat.csv  df_test.csv   global_geodataframe.gpkg
df_sample.csv  df_train.csv  sample_geodataframe.gpkg


In [28]:
def untidy_df(df):
    """Expand one metadata row into a wide record of relative file paths.

    Parameters
    ----------
    df : mapping
        A single row (e.g. as passed by ``DataFrame.apply(axis=1)``) with the
        keys ``parent_dir``, ``image_dir_name``, ``fname``, ``year``,
        ``month`` and ``has_udm``.

    Returns
    -------
    dict
        The six input values followed by one entry per file kind. Every path
        is ``<image_dir_name>/<sub-directory>/<fname><suffix>``. For
        ``parent_dir == 'test_public'`` only ``images_masked`` is filled in
        and the other paths are ``None``; otherwise ``udm_masks`` is ``None``
        unless ``has_udm`` is truthy.
    """
    parent_dir = df['parent_dir']
    tile_dir = df['image_dir_name']
    stem = df['fname']
    year = df['year']
    month = df['month']
    has_udm = df['has_udm']

    def locate(sub_dir, suffix):
        # Relative path of this row's file inside the given sub-directory.
        return tile_dir + sub_dir + stem + suffix

    # Start from the layout shared by every row; path entries default to None.
    record = {
        'parent_dir': parent_dir,
        'image_dir_name': tile_dir,
        'fname': stem,
        'year': year,
        'month': month,
        'has_udm': has_udm,
        'udm_masks': None,
        'images': None,
        'images_masked': locate('/images_masked/', '.tif'),
        'labels_buildings': None,
        'labels_udm': None,
        'labels_match': None,
        'labels_match_pix': None,
    }

    # Rows from test_public get only the masked image path.
    if parent_dir == 'test_public':
        return record

    if has_udm:
        record['udm_masks'] = locate('/UDM_masks/', '.tif')
    record['images'] = locate('/images/', '.tif')
    record['labels_buildings'] = locate('/labels/', '_Buildings.geojson')
    record['labels_udm'] = locate('/labels/', '_UDM.geojson')
    record['labels_match'] = locate('/labels_match/', '_Buildings.geojson')
    record['labels_match_pix'] = locate('/labels_match_pix/', '_Buildings.geojson')
    return record

In [29]:
def get_untidy_frame(df):
    """Build the wide ("untidy") frame for a metadata DataFrame.

    ``untidy_df`` is applied to every row of ``df`` (with a tqdm progress
    bar), the resulting dicts are assembled into a DataFrame, and duplicate
    rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame providing the keys that ``untidy_df`` reads.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame of wide records.
    """
    # One dict per input row.
    records = df.progress_apply(untidy_df, axis=1)
    # Assemble the records, then keep only the first of any identical rows.
    wide_frame = pd.DataFrame.from_records(records)
    return wide_frame.drop_duplicates()

In [30]:
# Build the wide frame for every metadata table (test first, then train,
# sample and the concatenated table).
test_untidy_df, train_untidy_df, sample_untidy_df, concat_untidy_df = map(
    get_untidy_frame,
    (df_test, df_train, df_sample, df_concat),
)

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/8739 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

  0%|          | 0/9351 [00:00<?, ?it/s]

In [31]:
concat_untidy_df

Unnamed: 0,parent_dir,image_dir_name,fname,year,month,has_udm,udm_masks,images,images_masked,labels_buildings,labels_udm,labels_match,labels_match_pix
0,train,L15-0331E-1257N_1327_3160_13,global_monthly_2018_01_mosaic_L15-0331E-1257N_...,2018,01,False,,L15-0331E-1257N_1327_3160_13/images/global_mon...,L15-0331E-1257N_1327_3160_13/images_masked/glo...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels_match/glob...,L15-0331E-1257N_1327_3160_13/labels_match_pix/...
1,train,L15-0331E-1257N_1327_3160_13,global_monthly_2018_02_mosaic_L15-0331E-1257N_...,2018,02,False,,L15-0331E-1257N_1327_3160_13/images/global_mon...,L15-0331E-1257N_1327_3160_13/images_masked/glo...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels_match/glob...,L15-0331E-1257N_1327_3160_13/labels_match_pix/...
2,train,L15-0331E-1257N_1327_3160_13,global_monthly_2018_03_mosaic_L15-0331E-1257N_...,2018,03,False,,L15-0331E-1257N_1327_3160_13/images/global_mon...,L15-0331E-1257N_1327_3160_13/images_masked/glo...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels_match/glob...,L15-0331E-1257N_1327_3160_13/labels_match_pix/...
3,train,L15-0331E-1257N_1327_3160_13,global_monthly_2018_04_mosaic_L15-0331E-1257N_...,2018,04,False,,L15-0331E-1257N_1327_3160_13/images/global_mon...,L15-0331E-1257N_1327_3160_13/images_masked/glo...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels_match/glob...,L15-0331E-1257N_1327_3160_13/labels_match_pix/...
4,train,L15-0331E-1257N_1327_3160_13,global_monthly_2018_05_mosaic_L15-0331E-1257N_...,2018,05,False,,L15-0331E-1257N_1327_3160_13/images/global_mon...,L15-0331E-1257N_1327_3160_13/images_masked/glo...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels/global_mon...,L15-0331E-1257N_1327_3160_13/labels_match/glob...,L15-0331E-1257N_1327_3160_13/labels_match_pix/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,sample,L15-0506E-1204N_2027_3374_13,global_monthly_2019_07_mosaic_L15-0506E-1204N_...,2019,07,False,,L15-0506E-1204N_2027_3374_13/images/global_mon...,L15-0506E-1204N_2027_3374_13/images_masked/glo...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels_match/glob...,L15-0506E-1204N_2027_3374_13/labels_match_pix/...
9226,sample,L15-0506E-1204N_2027_3374_13,global_monthly_2019_08_mosaic_L15-0506E-1204N_...,2019,08,False,,L15-0506E-1204N_2027_3374_13/images/global_mon...,L15-0506E-1204N_2027_3374_13/images_masked/glo...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels_match/glob...,L15-0506E-1204N_2027_3374_13/labels_match_pix/...
9227,sample,L15-0506E-1204N_2027_3374_13,global_monthly_2019_09_mosaic_L15-0506E-1204N_...,2019,09,False,,L15-0506E-1204N_2027_3374_13/images/global_mon...,L15-0506E-1204N_2027_3374_13/images_masked/glo...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels_match/glob...,L15-0506E-1204N_2027_3374_13/labels_match_pix/...
9228,sample,L15-0506E-1204N_2027_3374_13,global_monthly_2019_10_mosaic_L15-0506E-1204N_...,2019,10,False,,L15-0506E-1204N_2027_3374_13/images/global_mon...,L15-0506E-1204N_2027_3374_13/images_masked/glo...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels/global_mon...,L15-0506E-1204N_2027_3374_13/labels_match/glob...,L15-0506E-1204N_2027_3374_13/labels_match_pix/...


The CSV format above will make it easier for us to create our custom PyTorch dataset class. Notice how each row gives you access to whichever image or GeoJSON file you want, and how the files are all grouped by their corresponding month and year.

### Finally, we save the untidy dataframes

In [32]:
# Write each wide frame to the output directory, without the row index.
for csv_name, frame in (
    ('df_train_untidy.csv', train_untidy_df),
    ('df_test_untidy.csv', test_untidy_df),
    ('df_sample_untidy.csv', sample_untidy_df),
    ('df_concat_untidy.csv', concat_untidy_df),
):
    frame.to_csv(output_csv_path/csv_name, index=False)

In [33]:
!ls ../data/output_csvs

df_concat.csv	      df_test.csv	   global_geodataframe.gpkg
df_concat_untidy.csv  df_test_untidy.csv   sample_geodataframe.gpkg
df_sample.csv	      df_train.csv
df_sample_untidy.csv  df_train_untidy.csv
