# Before we start, you need to know that

* You will need to download the datasets in order to run this notebook.
* Raw data are stored in NumPy `.npy` format.
* We impose restrictions on training time and resources to push the state-of-the-art further.

# Part 1: Dataset Overview

In [4]:
from os.path import join

%load_ext autoreload
%autoreload 2

# Name of the dataset used by every cell below; to switch, comment this line
# out and uncomment one of the alternatives.
dataset = 'deepsea'
#dataset = 'navierstokes'
#dataset = 'crypto'

# Directories used by this notebook, relative to the current working directory.
baseline_dir = 'simple_baseline_models/'
test_dir = 'test_model'
inges_dir = 'ingestion/'
score_dir = 'scoring/'  

# Add these directories to the import path so that later cells can import
# modules stored in them (e.g. `model` and `score`).
from sys import path
path.append(test_dir); path.append(inges_dir); path.append(score_dir); path.append(baseline_dir);

model_simple = join(baseline_dir, 'decathlon_linear', 'model.py') # choose one simple baseline model
model_submit = join(test_dir, 'model.py') # submitted models must be called model.py
# Create the submission directory if it is missing and copy the baseline model into it.
# NOTE: this overwrites any existing model.py in test_dir each time the cell runs.
!mkdir -p $test_dir
!cp $model_simple $model_submit

In [2]:
from dev_datasets import DecathlonDataset, extract_metadata

# Build the train and test splits of the selected dataset from ./dev_public.
train_dataset, test_dataset = (
    DecathlonDataset(dataset, './dev_public', split_name)
    for split_name in ('train', 'test')
)

# Extract the metadata object of each split.
md_train, md_test = extract_metadata(train_dataset), extract_metadata(test_dataset)

# Print a short summary of the training metadata, one labelled value per line.
for label, getter in (
    ("Dataset path: ", md_train.get_dataset_name),
    ("Input shape: ", md_train.get_tensor_shape),
    ("Output shape:", md_train.get_output_shape),
    ("Dataset size: ", md_train.size),
):
    print(label, getter())

Dataset path:  deepsea
Input shape:  (1, 4, 1000, 1)
Output shape: (36,)
Dataset size:  59394


In [3]:
'''
Quick check of get_solution from score.py: load the solution array for the
selected dataset and print its shape and contents.
'''

from score import get_solution

# NOTE(review): presumably the ground-truth labels that predictions are scored
# against; `solution` is passed to the scorer in a later cell -- confirm in score.py.
solution = get_solution("dev_public", dataset)
print(solution.shape)
print(solution)



2022-07-11 16:33:41,212 INFO score.py: solution shape=(14849, 36)
(14849, 36)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 1 0 0]]


In [4]:
from torch.utils.data import DataLoader

def get_dataloader(dataset, batch_size, split):
    """Build the PyTorch DataLoader for one split of a dataset.

    Args:
        dataset: dataset object to iterate over. It must expose a
            ``required_batch_size`` attribute and a ``collate_fn`` attribute,
            both of which are forwarded to the DataLoader.
        batch_size: batch size to use when ``dataset.required_batch_size`` is
            falsy (e.g. ``None``); otherwise the dataset's value takes precedence.
        split: ``"train"`` (samples are shuffled) or ``"test"`` (samples keep
            their original order).

    Return:
        dataloader: PyTorch DataLoader over ``dataset``.

    Raises:
        ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail early with a clear message instead of reaching the return statement
    # with no loader built for an unrecognised split.
    if split not in ("train", "test"):
        raise ValueError(
            "split must be 'train' or 'test', got {!r}".format(split)
        )

    return DataLoader(
        dataset,
        # A batch size imposed by the dataset overrides the requested one.
        dataset.required_batch_size or batch_size,
        # Only the training split is shuffled.
        shuffle=(split == "train"),
        drop_last=False,
        collate_fn=dataset.collate_fn,
    )


# One sample per batch, unless the dataset imposes its own batch size.
batch_size = 1
train_loader, test_loader = (
    get_dataloader(split_data, batch_size, split_name)
    for split_data, split_name in ((train_dataset, 'train'), (test_dataset, 'test'))
)

In [5]:
# Walk the test loader once, gathering every label row into a flat Python list.
labels = []
for inputs, targets in test_loader:
    # Show batch shapes only while fewer than 10 labels have been gathered.
    if len(labels) < 10:
        print(inputs.shape, targets.shape)
    labels.extend(targets.tolist())

torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])
torch.Size([1, 1, 4, 1000, 1]) torch.Size([1, 36])


# Part 2: Model, Prediction and Metrics

In [6]:
%load_ext autoreload
%autoreload 2

# instantiate the submitted model with the training-set metadata
from model import Model
M = Model(md_train)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Device Found =  cuda 
Moving Model and Data into the device...


INPUT SHAPE =  (4, 1, 1000, 1)
input_shape, fc_size (4, 1, 1000, 1) 4000

PyModel Defined

TorchModel(
  (fc): Linear(in_features=4000, out_features=36, bias=True)
)


In [7]:
# train the model, passing the time budget as the remaining budget
# NOTE(review): the budget is presumably in seconds (the training log reports "sec used") -- confirm in model.py
time_budget = 200
M.train(train_dataset, remaining_time_budget=time_budget)

2022-07-11 16:33:57,759 INFO model.py: Begin training for another 100 steps...
2022-07-11 16:33:57,874 INFO model.py: 100 steps trained. 0.11 sec used. Now total steps trained: 100. Total time used for training: 0.11 sec. Current estimated time per step: 1.13e-03 sec.


In [8]:
# get predictions for the test dataset by calling the model's test method;
# the same time_budget value is passed again as the remaining budget
prediction = M.test(test_dataset, remaining_time_budget=time_budget)
print(prediction.shape)
print(prediction[0])  # first row of the prediction array

2022-07-11 16:34:00,233 INFO model.py: Model already trained for 0.05051015254066067 epochs.
2022-07-11 16:34:00,233 INFO model.py: Begin testing...
2022-07-11 16:34:00,511 INFO model.py: [+] Successfully made one prediction. 0.28 sec used. Total time used for testing: 0.28 sec. Current estimated time for test: 2.78e-01 sec.
(14849, 36)
[3.0557723e-03 1.2718810e-02 4.6302556e-04 3.7274160e-04 2.6033684e-03
 1.5464555e-02 1.7152583e-02 5.4378547e-02 2.6468853e-03 2.8501680e-02
 3.1255908e-02 7.3938828e-04 2.0707171e-02 2.2594687e-02 2.8324619e-02
 1.4311513e-03 1.0409473e-02 2.1078214e-03 4.9858405e-03 3.8973498e-04
 4.1606752e-04 1.2523745e-02 7.0136213e-03 8.8643851e-03 6.7480560e-04
 2.7334378e-03 8.4448206e-05 2.5742915e-03 5.7230489e-03 1.4331472e-02
 2.9606645e-03 5.0161537e-02 3.9330753e-03 2.5501000e-02 4.2292615e-03
 7.6331752e-03]


In [9]:
from score import decathlon_scorer

# score the predictions against the solution array loaded earlier for this dataset
score = decathlon_scorer(solution, prediction, dataset)
print ("Score: ", score)

Score:  0.4418436891040296


# Part 3: Test and Submission

<font color='red'>
    It is important that you test your submission files before submitting them. To make a submission, all you have to do is modify the file <code>model.py</code> in the <code>test_model/</code> directory, then run the test below to make sure everything works. This is the actual program that will be run on the server to test your submission.
</font>

In [1]:
# run local test: execute run_local_test.py on the code in ./test_model
# with the data in ./dev_public and a time budget of 10
# NOTE(review): the time budget is presumably in seconds -- confirm in run_local_test.py

!python run_local_test.py --code_dir=./test_model --dataset_dir=./dev_public --time_budget=10

2022-07-11 17:09:48 INFO run_local_test.py: ##################################################
2022-07-11 17:09:48 INFO run_local_test.py: Begin running local test using
2022-07-11 17:09:48 INFO run_local_test.py: code_dir = test_model
2022-07-11 17:09:48 INFO run_local_test.py: dataset_dir = dev_public
2022-07-11 17:09:48 INFO run_local_test.py: ##################################################
2022-07-11 17:09:48 INFO run_local_test.py: Cleaning existing output directory of last run: /app/codalab/github.com/cxxz/automl_decathlon_starting_kit/sample_result_submission
2022-07-11 17:09:48 INFO run_local_test.py: Cleaning existing output directory of last run: /app/codalab/github.com/cxxz/automl_decathlon_starting_kit/scoring_output
2022-07-11 17:09:49,750 DEBUG ingestion.py: Parsed args are: Namespace(code_dir='./test_model', dataset_dir='./dev_public', ingestion_program_dir='/app/codalab/github.com/cxxz/automl_decathlon_starting_kit/ingestion', output_dir='/app/codalab/github.com/cxxz

In [5]:
# compress model to be submitted
from data_io import zipdir

# Name of the archive to upload.
submission_filename = 'mysubmission.zip'
# NOTE(review): presumably zips the contents of test_dir into submission_filename -- confirm in data_io.zipdir
zipdir(submission_filename, test_dir)
print("Submit this file: " + submission_filename)

Submit this file: mysubmission.zip


