In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
from GLC.data_loading.common import load_patch

In [14]:
# Load Training Data
#
# Reads the French and US training observations, keeps only the most
# frequently observed species, draws a fixed-size random subset and relabels
# the species ids with a LabelEncoder.

DATA_PATH = Path("/input/")

N_TOP_SPECIES = 500   # number of most frequent species to keep
N_SAMPLES = 100000    # number of observations drawn for the training subset
SEED = 42             # fixed seed: the sampled subset (and every file derived
                      # from it in later cells) must be identical across re-runs

df_obs_fr = pd.read_csv(DATA_PATH / "observations" / "observations_fr_train.csv", sep=";", index_col="observation_id")
df_obs_us = pd.read_csv(DATA_PATH / "observations" / "observations_us_train.csv", sep=";", index_col="observation_id")

df_obs = pd.concat((df_obs_fr, df_obs_us))

# value_counts() sorts by descending frequency, so the first N_TOP_SPECIES
# index entries are the most common species. Slicing the index (rather than the
# Series) is always positional, even though the labels themselves are species ids.
top_species = df_obs['species_id'].value_counts().index[:N_TOP_SPECIES]
df_obs = df_obs[df_obs['species_id'].isin(top_species)]

# Without random_state every kernel restart would pick a different subset.
df_obs = df_obs.sample(n=N_SAMPLES, random_state=SEED)

print("Number of observations for training: {}".format(len(df_obs)))

# Relabel

from sklearn import preprocessing

# `le` is kept so the encoded labels can be mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder()
le.fit(df_obs.species_id)
df_obs['species_id'] = le.transform(df_obs.species_id)
number_of_unique_species = np.unique(df_obs['species_id']).shape[0]
print("Number of unique species: "+str(number_of_unique_species))

Number of observations for training: 100000
Number of unique species: 500


In [15]:
## Save dataset
# Persist the sampled, relabelled observations as CSV (index included).
subset_dir = DATA_PATH / 'data-subset'
# to_csv does not create missing parent directories, so make sure it exists.
subset_dir.mkdir(parents=True, exist_ok=True)
df_obs.to_csv(subset_dir / 'data2100000.csv')

In [16]:
## Save patches
# Loads the patch of every observation in df_obs, stacks its arrays into one
# 6-channel array and writes the results to compressed .npz files in batches.

BATCH_SIZE = 1000  # number of patches stored per .npz file
PATCH_DIR = DATA_PATH / 'data-subset' / 'patches'
# np.savez_compressed does not create missing directories.
PATCH_DIR.mkdir(parents=True, exist_ok=True)


def _stack_patch(patch):
    """Stack the four arrays returned by load_patch along the channel axis.

    patch[0] supplies the leading channels and patch[1], patch[2], patch[3]
    become channels 3, 4 and 5 of the result. The result is float64.

    NOTE(review): assumes patch[0] is (H, W, 3) and patch[1..3] are (H, W);
    the fixed channel indices 3-5 rely on it -- confirm against load_patch.
    """
    return np.dstack((patch[0], patch[1], patch[2], patch[3])).astype(np.float64, copy=False)


def _save_batch(batch, batch_index):
    """Write one list of stacked patches to PATCH_DIR as a compressed .npz file."""
    np.savez_compressed(PATCH_DIR / ('patches1002000-'+str(batch_index)), np.array(batch))
    print(str(batch_index)+" done")


patches = []
i=0
# Iterate over the index labels directly; building a row Series with
# df_obs.iloc[row] just to read its .name is needlessly slow.
for observation_id in df_obs.index:
    patches.append(_stack_patch(load_patch(observation_id, DATA_PATH)))
    if len(patches) == BATCH_SIZE:
        _save_batch(patches, i)
        i+=1
        patches=[]

# Flush the last, partially filled batch; otherwise its patches would be
# lost whenever len(df_obs) is not a multiple of BATCH_SIZE.
if patches:
    _save_batch(patches, i)
    i+=1
    patches=[]


0 done
1 done


KeyboardInterrupt: 