In [1]:
from clearml import Task, StorageManager, Dataset
import sys, json, os, jsonlines, ipdb

In [13]:
class dataset_git:
    """Git-like convenience wrapper around the versions of a ClearML dataset project.

    An instance tracks one dataset version (``latest_dataset``) of the project
    ``pull_dataset_project`` and can push a modified local folder as a new
    child version of it.
    """

    @staticmethod
    def create_root(dataset_name, dataset_project, data_path="upload", wildcard="*.jsonl"):
        """Create, upload, finalize and publish a root dataset tagged "original".

        Args:
            dataset_name: Name given to the new dataset.
            dataset_project: Project the dataset is created in.
            data_path: Local path handed to ``add_files`` (default ``"upload"``).
            wildcard: File pattern selecting which files are added
                (default ``"*.jsonl"``).
        """
        dataset = Dataset.create(
            dataset_name=dataset_name,
            dataset_project=dataset_project,
            dataset_tags=["original"],
            parent_datasets=None,
            use_current_task=False,
        )
        default_storage = dataset.get_default_storage()
        dataset.add_files(
            data_path,
            wildcard=wildcard,
            local_base_folder=".",
            dataset_path=".",
            recursive=True,
            verbose=False,
        )
        dataset.upload(output_url=default_storage)
        dataset.finalize()
        dataset.publish()

    @staticmethod
    def _list_version_ids(dataset_project):
        """Return the ids of all completed datasets listed for ``dataset_project``."""
        listed = Dataset.list_datasets([dataset_project], only_completed=True)
        return [version["id"] for version in listed]

    def __init__(self, pull_dataset_project:str):
        """Bind to the last completed dataset version listed for the project.

        Args:
            pull_dataset_project: Project whose dataset versions are tracked.

        Raises:
            IndexError: If the project has no completed dataset.
        """
        self.pull_dataset_project = pull_dataset_project
        self.version_ids = self._list_version_ids(pull_dataset_project)
        if not self.version_ids:
            # Same exception type the bare ``[-1]`` lookup would raise, but with
            # a message that tells the user what is actually wrong.
            raise IndexError(
                "No completed datasets found in project {!r}; "
                "create one first (e.g. with dataset_git.create_root)".format(pull_dataset_project)
            )
        # NOTE(review): assumes list_datasets returns versions oldest-first so
        # that the last entry is the newest one -- confirm against ClearML.
        self.latest_dataset = Dataset.get(dataset_id=self.version_ids[-1])
        self.default_storage = self.latest_dataset.get_default_storage()

    def get_current_version_id(self):
        """Return the id of the dataset version currently tracked."""
        return self.latest_dataset.id

    def get_parent(self):
        """Return the dependency-graph entry of the tracked dataset version.

        Presumably this is the list of parent dataset ids (empty for a root
        dataset) -- verify against the ClearML documentation.
        """
        dependency = self.latest_dataset.get_dependency_graph()
        return dependency[self.latest_dataset.id]

    def get_latest_dataset(self, target_folder_path):
        '''
        returns a local path of a dataset copy
        '''
        return self.latest_dataset.get_mutable_local_copy(target_folder_path)

    def sync_folder2dataset(self, local_data_path:str):
        """Upload ``local_data_path`` as a new child version if it differs.

        The folder is compared against the tracked version with
        ``verify_dataset_hash``. Only when differences are reported is a child
        dataset created, synced, uploaded to the default storage and finalized;
        the instance then tracks that child and refreshes ``version_ids``.

        Args:
            local_data_path: Local folder holding the (possibly modified) copy.
        """
        # Check for changes BEFORE creating anything on the server, so a no-op
        # call does not leave an empty, un-finalized child dataset behind.
        # NOTE(review): verify_dataset_hash presumably only inspects files that
        # are already part of the dataset; brand-new files may go undetected --
        # confirm.
        modifications = self.latest_dataset.verify_dataset_hash(local_copy_path=local_data_path)
        print("Files modified: {}".format(modifications))
        if len(modifications) == 0:
            print("No files to update")
            return

        print("Syncing files...")
        new_dataset = Dataset.create(
            dataset_name="child_of_" + self.latest_dataset.id,
            dataset_project=self.pull_dataset_project,
            parent_datasets=[self.latest_dataset.id],
        )
        new_dataset.sync_folder(local_data_path, verbose=False)
        new_dataset.upload(output_url=self.default_storage)
        new_dataset.finalize()
        self.latest_dataset = new_dataset
        self.version_ids = self._list_version_ids(self.pull_dataset_project)
        print("Success! Files synced and updated.")
                           

In [14]:
# Bind to the last completed dataset version listed for the "datasets/test" project.
git = dataset_git("datasets/test")

In [4]:
# Show the id of the dataset version currently tracked.
git.get_current_version_id()

'31ea90118187499a9c6c61a3e9a7bc1d'

In [5]:
# Show the dependency-graph entry of the tracked version (empty here, see output below).
git.get_parent()

[]

In [6]:
# Show the ids of every completed dataset version listed for the project.
git.version_ids

['31ea90118187499a9c6c61a3e9a7bc1d']

In [9]:
# Fetch a mutable local copy of the tracked dataset version into ./temp.
folder_path = git.get_latest_dataset("./temp")

def load_jsonl(load_path:str):
    """Read a JSON Lines file and return its records as a list.

    Args:
        load_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The file is decoded explicitly as UTF-8 (the encoding JSON text is
    # expected to use) rather than with the platform default, so non-ASCII
    # content loads the same way on every OS.
    with open(load_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on an empty document.
        return [json.loads(doc) for doc in file if doc.strip()]

def to_jsonl(filename:str, file_obj):
    """Write an iterable of JSON-serializable records to ``filename`` as JSON Lines.

    An existing file at ``filename`` is overwritten.

    Args:
        filename: Path of the output file.
        file_obj: Iterable of records; each one is written as a single line.
    """
    # Context managers guarantee the data is flushed and the handle closed
    # before returning, so later steps (hashing / syncing the folder) see the
    # complete file. The writer does not own the handle, hence the outer `with`.
    with open(filename, 'wb') as resultfile:
        with jsonlines.Writer(resultfile) as writer:
            writer.write_all(file_obj)

# Locate train.jsonl inside the local dataset copy.
train_file = os.path.join(folder_path, "data/upload", "train.jsonl")
# Keep only the first five records ...
train_data = load_jsonl(train_file)[:5]
# ... and overwrite the file in place, so the local copy now differs from the stored version.
to_jsonl(train_file, train_data)

In [15]:
# Compare the local "temp" folder with the tracked version and upload it as a child dataset if it changed.
git.sync_folder2dataset("temp")

Files modified: ['data/upload/train.jsonl']
Syncing files...
Generating SHA2 hash for 3 files


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 7045.30it/s]

Hash generation completed
Uploading compressed dataset changes (1 files, total 9.09 KB) to http://experiment.sytes.net:8081





Upload completed (9.09 KB)
2021-11-18 02:36:34,248 - clearml.Task - INFO - Waiting to finish uploads
2021-11-18 02:36:34,285 - clearml.Task - INFO - Finished uploading
Success! Files synced and updated.
