# Taking a quick look at the data

The objective is to quickly gather some basic information about the data we are working with

In [17]:
import os
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
from anytree import Node, RenderTree
from typing import Dict, List

What are the shapes of the data we are working with?
- We are initially assuming (30000, 31) for train data and (300, 1259) for target data (note: the shape check further down finds (10001, 31) for the receiver data)
- Since the test dataset is not labeled, we will most likely use a test set and a dev set taken from the training set. The selected data should be drawn with a specific sampling technique that we will discuss later

In [None]:
training_dataset = './data/train_data/*'
test_dataset = './data/test_data/*'


def list_example_dirs(pattern: str):
    """Return the entries matching ``pattern`` together with their ids.

    Args:
        pattern: glob pattern whose matches are the per-example entries.

    Returns:
        A ``(paths, ids)`` tuple of two lists of equal length. ``paths`` holds
        the matched paths in sorted order; ``ids`` holds the last path
        component of each match (the text after the final '/' or '\\').
    """
    # glob makes no ordering guarantee; sorting keeps paths[0] (used by later
    # cells) stable across runs and machines.
    paths = sorted(glob(pattern))
    # Strip both POSIX and Windows separators so the id is the same on any OS.
    ids = [path.split('/')[-1].split('\\')[-1] for path in paths]
    return paths, ids


# Access training ids
training_paths, training_ids = list_example_dirs(training_dataset)

# Access test ids
test_paths, test_ids = list_example_dirs(test_dataset)

print(f'We are working with {len(training_ids)} training examples')
print(f'We are working with {len(test_ids)} test examples')



We are working with 2000 training examples
We are working with 150 test examples


Let us take a look at what is inside each id

In [43]:
# List the files stored for the first training example. os.listdir returns
# entries in arbitrary order, so sort them to keep the displayed list stable.
training_sample = sorted(os.listdir(training_paths[0]))
training_sample

['receiver_data_src_1.npy',
 'receiver_data_src_150.npy',
 'receiver_data_src_225.npy',
 'receiver_data_src_300.npy',
 'receiver_data_src_75.npy',
 'vp_model.npy']

--- 
Let us confirm that all data has the appropriate shape

In [None]:
# Expected array shape for every file found inside a training example directory.
EXPECTED_SHAPES = {
    'receiver_data_src_1.npy': (10001, 31),
    'receiver_data_src_75.npy': (10001, 31),
    'receiver_data_src_150.npy': (10001, 31),
    'receiver_data_src_225.npy': (10001, 31),
    'receiver_data_src_300.npy': (10001, 31),
    'vp_model.npy': (300, 1259),
}

for sample_dir in training_paths:
    for file_name, expected_shape in EXPECTED_SHAPES.items():
        file_path = os.path.join(sample_dir, file_name)
        # mmap_mode='r' memory-maps the file instead of reading the whole
        # array into memory; only the shape is needed here.
        actual_shape = np.load(file_path, mmap_mode='r').shape
        # Name the offending file so a failure is actionable.
        assert actual_shape == expected_shape, (
            f'{file_path}: expected shape {expected_shape}, got {actual_shape}'
        )

Receiver data shape: (10001, 31)
Target data shape: (300, 1259)


---
Let us find out the data type of every file


In [55]:
# Load one receiver file and the vp_model target from the first training
# example, then report the dtype each array is stored with.
training_sample, target_sample = (
    np.load(os.path.join(training_paths[0], file_name))
    for file_name in ('receiver_data_src_1.npy', 'vp_model.npy')
)
print(
    f'The dtype of a training instance is: {training_sample.dtype}',
    f'The dtype of a training label is: {target_sample.dtype}',
    sep='\n',
)



The dtype of a training instance is: float32
The dtype of a training label is: float64


Training instances are single-precision floats (`float32`), while labels are expected to be double-precision floats (`float64`)

---
Before commencing exploratory data analysis we must decide on a strategy for the train/test split <br>
While ThinkOnward provides test and train data, I will try a different strategy that will give us a much easier understanding of the model performance 