# YAML Database

Date: 2024/03/27

複数のファイルへファイル単位でNLPなど行い、その結果を、YAMLへ集約したい。
NLPの処理は結構時間かかるので、私の仕事では、大量のファイルを処理すると10分以上かかる。
新規作成や更新されたファイルのみ処理したい。

## Dummy file 生成

In [1]:
import random

with open('tmp/test1.txt', 'w') as f:
    f.write(str(random.random()))
with open('tmp/test1.txt', 'w') as f:
    f.write(str(random.random()))
with open('tmp/test1.txt', 'w') as f:
    f.write(str(random.random()))
with open('tmp/test1.txt', 'w') as f:
    f.write(str(random.random()))

In [2]:
import glob
import os

paths = glob.glob('tmp/test*.txt')
# 作成時間
[os.path.getctime(path) for path in paths]

[1711536435.0841026,
 1711536448.0236292,
 1711502152.1057749,
 1711502149.0519512]

## yaml_db クラス

In [3]:
import os
import yaml

class yaml_db:

    # (C)RUD
    def __init__(self, db_path):
        self.db_path = db_path
        if os.path.exists(db_path):
            with open(db_path, 'r', encoding='utf-8') as f:
                self.db = yaml.safe_load(f)
        else:
            self.db = {}
            with open(db_path, 'w', encoding='utf-8') as f:
                f.write(yaml.dump(db))

    def check(self, path):
        return path not in self.db.keys() or self.db[path]['timestamp'] != os.path.getmtime(path)
        
    # CR(U)D
    def update(self, path, content):
        if os.path.exists(path) and ((path not in self.db.keys()) or (self.db[path]['timestamp'] != os.path.getmtime(path))):
            self.db[path] = {'timestamp': os.path.getmtime(path) ,'content': content}
            return True
        else:
            return False

    # C(R)UD
    def read(self, path):
        if path in self.db.keys():
            return self.db[path]
        else:
            False

    def read_all(self):
        db_ = {}
        for path in self.db.keys():
            db_[path] = self.db[path]['content']
        return db_

    def delete(self, path):
        if path in self.db.keys():
            del self.db[path]
            return True
        else:
            return False

    def delete_all(self):
        self.db = {}
        return True
        
    def commit(self):
        try:
            with open(self.db_path, 'w', encoding='utf-8') as f:
                f.write(yaml.dump(self.db))
                return True
        except:
            return False

## テスト

In [4]:
db = yaml_db('tmp/db.yaml')
db.delete_all()

True

In [5]:
db.check('tmp/test1.txt')

True

In [6]:
r1 = db.update('tmp/test1.txt', str(random.random()))
r2 = db.update('tmp/test2.txt', str(random.random()))
r3 = db.update('tmp/test3.txt', str(random.random()))
r4 = db.update('tmp/test4.txt', str(random.random()))
[r1, r2, r3, r4]

[True, True, True, True]

In [7]:
import time
time.sleep(3)

with open('tmp/test1.txt', 'w') as f:
    f.write(str(random.random()))

with open('tmp/test4.txt', 'w') as f:
    f.write(str(random.random()))

In [8]:
r1 = db.update('tmp/test1.txt', str(random.random()))
r2 = db.update('tmp/test2.txt', str(random.random()))
r3 = db.update('tmp/test3.txt', str(random.random()))
r4 = db.update('tmp/test4.txt', str(random.random()))
[r1, r2, r3, r4]

[True, False, False, True]

In [9]:
db.check('tmp/test1.txt')

False

In [10]:
db.update('tmp/test5.txt', str(random.random()))

False

In [11]:
db.read('tmp/test1.txt')

{'timestamp': 1711536451.086429, 'content': '0.07870997096548338'}

In [12]:
db.read_all()

{'tmp/test1.txt': '0.07870997096548338',
 'tmp/test2.txt': '0.0930992549601517',
 'tmp/test3.txt': '0.34313957275316975',
 'tmp/test4.txt': '0.7783836999858296'}

In [13]:
db.commit()

True