Skip to content

Commit

Permalink
feat: allow disabling storage persistence (#1539)
Browse files Browse the repository at this point in the history
  • Loading branch information
vladfrangu committed Sep 15, 2022
1 parent e3cb509 commit f65e3c6
Show file tree
Hide file tree
Showing 13 changed files with 128 additions and 23 deletions.
25 changes: 21 additions & 4 deletions packages/core/src/configuration.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,17 @@ export interface ConfigurationOptions {
/**
* Sets the log level to the given value.
*
* Alternative to `CRAWLEE_LOG_LEVEL`.
* Alternative to `CRAWLEE_LOG_LEVEL` environment variable.
* @default 'INFO'
*/
logLevel?: LogLevel | LogLevel[keyof LogLevel];

/**
* Defines whether the storage client used should persist the data it stores.
*
* Alternative to `CRAWLEE_PERSIST_STORAGE` environment variable.
*/
persistStorage?: boolean;
}

/**
Expand Down Expand Up @@ -203,7 +210,9 @@ export interface ConfigurationOptions {
* `defaultDatasetId` | `CRAWLEE_DEFAULT_DATASET_ID` | `'default'`
* `defaultKeyValueStoreId` | `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` | `'default'`
* `defaultRequestQueueId` | `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID` | `'default'`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60e3`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60_000`
* `purgeOnStart` | `CRAWLEE_PURGE_ON_START` | `true`
* `persistStorage` | `CRAWLEE_PERSIST_STORAGE` | `true`
*
* ## Advanced Configuration Options
*
Expand All @@ -213,6 +222,8 @@ export interface ConfigurationOptions {
* `xvfb` | `CRAWLEE_XVFB` | -
* `chromeExecutablePath` | `CRAWLEE_CHROME_EXECUTABLE_PATH` | -
* `defaultBrowserPath` | `CRAWLEE_DEFAULT_BROWSER_PATH` | -
* `disableBrowserSandbox` | `CRAWLEE_DISABLE_BROWSER_SANDBOX` | -
* `availableMemoryRatio` | `CRAWLEE_AVAILABLE_MEMORY_RATIO` | `0.25`
*/
export class Configuration {
/**
Expand All @@ -233,9 +244,10 @@ export class Configuration {
CRAWLEE_DEFAULT_BROWSER_PATH: 'defaultBrowserPath',
CRAWLEE_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',
CRAWLEE_LOG_LEVEL: 'logLevel',
CRAWLEE_PERSIST_STORAGE: 'persistStorage',
};

protected static BOOLEAN_VARS = ['purgeOnStart', 'headless', 'xvfb', 'disableBrowserSandbox'];
protected static BOOLEAN_VARS = ['purgeOnStart', 'headless', 'xvfb', 'disableBrowserSandbox', 'persistStorage'];

protected static INTEGER_VARS = ['memoryMbytes', 'persistStateIntervalMillis', 'systemInfoIntervalMillis'];

Expand All @@ -251,6 +263,7 @@ export class Configuration {
headless: true,
persistStateIntervalMillis: 60_000,
systemInfoIntervalMillis: 60_000,
persistStorage: true,
};

/**
Expand Down Expand Up @@ -380,7 +393,11 @@ export class Configuration {
return this.services.get(cacheKey) as MemoryStorage;
}

const storage = new MemoryStorage(options);
const storage = new MemoryStorage({
persistStorage: this.get('persistStorage'),
// Override persistStorage if user provides it via storageClientOptions
...options,
});
this.services.set(cacheKey, storage);

return storage;
Expand Down
12 changes: 12 additions & 0 deletions packages/memory-storage/src/memory-storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ export interface MemoryStorageOptions {
* @default process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false
*/
writeMetadata?: boolean;

/**
* Whether the memory storage should also write its stored content to the disk.
*
* You can also disable this by setting the `CRAWLEE_PERSIST_STORAGE` environment variable to `false`.
* @default true
*/
persistStorage?: boolean;
}

export class MemoryStorage implements storage.StorageClient {
Expand All @@ -34,6 +42,7 @@ export class MemoryStorage implements storage.StorageClient {
readonly keyValueStoresDirectory: string;
readonly requestQueuesDirectory: string;
readonly writeMetadata: boolean;
readonly persistStorage: boolean;

readonly keyValueStoresHandled: KeyValueStoreClient[] = [];
readonly datasetClientsHandled: DatasetClient[] = [];
Expand All @@ -43,6 +52,7 @@ export class MemoryStorage implements storage.StorageClient {
s.object({
localDataDirectory: s.string.optional,
writeMetadata: s.boolean.optional,
persistStorage: s.boolean.optional,
}).parse(options);

// v3.0.0 used `crawlee_storage` as the default, we changed this in v3.0.1 to just `storage`,
Expand All @@ -61,6 +71,8 @@ export class MemoryStorage implements storage.StorageClient {
this.keyValueStoresDirectory = resolve(this.localDataDirectory, 'key_value_stores');
this.requestQueuesDirectory = resolve(this.localDataDirectory, 'request_queues');
this.writeMetadata = options.writeMetadata ?? process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false;
this.persistStorage = options.persistStorage
?? (process.env.CRAWLEE_PERSIST_STORAGE ? !['false', '0', ''].includes(process.env.CRAWLEE_PERSIST_STORAGE!) : true);

initWorkerIfNeeded();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class DatasetCollectionClient implements storage.DatasetCollectionClient
id: datasetInfo.name ?? datasetInfo.id,
data: datasetInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return datasetInfo;
Expand Down
2 changes: 2 additions & 0 deletions packages/memory-storage/src/resource-clients/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ export class DatasetClient<Data extends Dictionary = Dictionary> extends BaseCli
entityDirectory: existingStoreById.datasetDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down Expand Up @@ -260,6 +261,7 @@ export class DatasetClient<Data extends Dictionary = Dictionary> extends BaseCli
entityDirectory: this.datasetDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class KeyValueStoreCollectionClient implements storage.KeyValueStoreColle
id: kvStoreInfo.name ?? kvStoreInfo.id,
data: kvStoreInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return kvStoreInfo;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: existingStoreById.keyValueStoreDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down Expand Up @@ -304,6 +305,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: existingStoreById.keyValueStoreDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Expand Down Expand Up @@ -334,6 +336,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: this.keyValueStoreDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class RequestQueueCollectionClient implements storage.RequestQueueCollect
id: queueInfo.name ?? queueInfo.id,
data: queueInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return queueInfo;
Expand Down
3 changes: 3 additions & 0 deletions packages/memory-storage/src/resource-clients/request-queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: existingQueueById.requestQueueDirectory,
id: existingQueueById.name ?? existingQueueById.id,
writeMetadata: existingQueueById.client.writeMetadata,
persistStorage: existingQueueById.client.persistStorage,
});
}
}
Expand Down Expand Up @@ -361,6 +362,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: this.requestQueueDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand All @@ -372,6 +374,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: this.requestQueueDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down
3 changes: 3 additions & 0 deletions packages/memory-storage/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ interface MetadataUpdate<Type extends EntityType, DataType> {
entityDirectory: string;
data: DataType;
writeMetadata: boolean;
persistStorage: boolean;
}

interface EntriesUpdate<Type extends EntityType, DataType> {
Expand All @@ -99,6 +100,7 @@ interface EntriesUpdate<Type extends EntityType, DataType> {
entityDirectory: string;
data: DataType;
writeMetadata: boolean;
persistStorage: boolean;
}

interface EntryDelete<Type extends EntityType> {
Expand All @@ -107,6 +109,7 @@ interface EntryDelete<Type extends EntityType> {
action: 'delete-entry';
entityDirectory: string;
writeMetadata: boolean;
persistStorage: boolean;
data: {
id: string;
};
Expand Down
13 changes: 9 additions & 4 deletions packages/memory-storage/src/workers/worker-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ export async function handleMessage(message: WorkerReceivedMessage & { messageId
}

async function updateMetadata(message: WorkerUpdateMetadataMessage) {
// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);

// Skip writing the actual metadata file. This check runs before the directory is ensured, so nothing is created on disk when metadata writing is disabled
if (!message.writeMetadata) {
return;
}

// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);

// Write the metadata to the file
const filePath = resolve(dir, '__metadata__.json');
await writeFile(filePath, JSON.stringify(message.data, null, '\t'));
Expand All @@ -61,6 +61,11 @@ async function lockAndWrite(filePath: string, data: unknown, stringify = true, r
}

async function updateItems(message: WorkerUpdateEntriesMessage) {
// Skip writing files to the disk if the client has the option set to false
if (!message.persistStorage) {
return;
}

// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);
Expand Down
11 changes: 11 additions & 0 deletions packages/memory-storage/test/__shared__.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { access } from 'node:fs/promises';
import { setTimeout } from 'node:timers/promises';

/**
 * Waits until `path` is accessible on disk, polling with `access` every 50 ms.
 *
 * Every `access` failure is treated as "not written yet" and retried, but the wait is bounded:
 * once `timeoutMillis` has elapsed the promise rejects with an error naming the path and the
 * last failure, instead of polling forever when the file is never written.
 *
 * @param path File or directory path that is expected to appear on disk.
 * @param timeoutMillis Maximum time to keep polling before giving up. Pass `Infinity` to wait indefinitely.
 * @returns A promise that resolves (with no value) as soon as `path` is accessible.
 * @throws {Error} When `path` is still not accessible after `timeoutMillis`.
 */
export async function waitTillWrittenToDisk(path: string, timeoutMillis = 30_000): Promise<void> {
    const pollIntervalMillis = 50;
    const deadline = Date.now() + timeoutMillis;

    // Iterative polling (rather than recursion) so a long wait does not build up a chain of nested promises
    for (;;) {
        try {
            await access(path);
            return;
        } catch (error) {
            if (Date.now() >= deadline) {
                const reason = error instanceof Error ? error.message : String(error);
                throw new Error(`Timed out after ${timeoutMillis}ms waiting for "${path}" to be written to disk: ${reason}`);
            }
        }

        await setTimeout(pollIntervalMillis);
    }
}
57 changes: 57 additions & 0 deletions packages/memory-storage/test/no-writing-to-disk.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import { readdir, rm } from 'node:fs/promises';
import { resolve } from 'node:path';
import { MemoryStorage } from '@crawlee/memory-storage';
import { waitTillWrittenToDisk } from './__shared__';

/**
 * Suite for the `persistStorage` option of `MemoryStorage`.
 *
 * Each scenario builds a storage with `persistStorage: false`, stores one record in a key-value store,
 * and then inspects the file system under a scenario-specific directory.
 */
describe('persistStorage option', () => {
    // Root directory shared by all scenarios; removed once the suite finishes
    const suiteRoot = resolve(__dirname, './tmp/no-writing-to-disk');

    // Creates (or fetches) the default key-value store, stores one record in it and returns the store id
    const storeSingleRecord = async (client: MemoryStorage) => {
        const { id } = await client.keyValueStores().getOrCreate();
        await client.keyValueStore(id).setRecord({ key: 'foo', value: 'test' });
        return id;
    };

    afterAll(async () => {
        await rm(suiteRoot, { force: true, recursive: true });
    });

    describe('when false and writeMetadata is also false', () => {
        const localDataDirectory = resolve(suiteRoot, './no-metadata');
        const storage = new MemoryStorage({ localDataDirectory, persistStorage: false });

        test('creating a key-value pair in a key-value store should not write data to the disk', async () => {
            await storeSingleRecord(storage);

            // Listing the data directory must reject: a rejection means the directory was never created on disk
            await expect(() => readdir(localDataDirectory)).rejects.toThrow();
        });
    });

    describe('when false and writeMetadata is true', () => {
        const localDataDirectory = resolve(suiteRoot, './with-metadata');
        const storage = new MemoryStorage({ localDataDirectory, persistStorage: false, writeMetadata: true });

        test('creating a key-value pair in a key-value store should not write data to the disk, but it should write the __metadata__ file', async () => {
            const storeId = await storeSingleRecord(storage);
            const storeDirectory = resolve(storage.keyValueStoresDirectory, `${storeId}`);

            // Wait for the store directory to show up before listing it
            await waitTillWrittenToDisk(storeDirectory);
            const entries = await readdir(storeDirectory);

            // Only the metadata file is expected; the stored record itself must not be on disk
            expect(entries).toHaveLength(1);
            expect(entries).toEqual(['__metadata__.json']);
        });
    });
});
19 changes: 4 additions & 15 deletions packages/memory-storage/test/write-metadata.test.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,7 @@
import { MemoryStorage } from '@crawlee/memory-storage';
import { access, readdir, rm } from 'node:fs/promises';
import { readdir, rm } from 'node:fs/promises';
import { resolve } from 'node:path';
import { setTimeout } from 'node:timers/promises';

/**
 * Waits until `path` is accessible on disk, polling with `access` every 50 ms.
 *
 * Every `access` failure is treated as "not written yet" and retried, but the wait is bounded:
 * once `timeoutMillis` has elapsed the promise rejects with an error naming the path and the
 * last failure, instead of polling forever when the file is never written.
 *
 * @param path File or directory path that is expected to appear on disk.
 * @param timeoutMillis Maximum time to keep polling before giving up. Pass `Infinity` to wait indefinitely.
 * @returns A promise that resolves (with no value) as soon as `path` is accessible.
 * @throws {Error} When `path` is still not accessible after `timeoutMillis`.
 */
async function waitTillWrittenToDisk(path: string, timeoutMillis = 30_000): Promise<void> {
    const pollIntervalMillis = 50;
    const deadline = Date.now() + timeoutMillis;

    // Iterative polling (rather than recursion) so a long wait does not build up a chain of nested promises
    for (;;) {
        try {
            await access(path);
            return;
        } catch (error) {
            if (Date.now() >= deadline) {
                const reason = error instanceof Error ? error.message : String(error);
                throw new Error(`Timed out after ${timeoutMillis}ms waiting for "${path}" to be written to disk: ${reason}`);
            }
        }

        await setTimeout(pollIntervalMillis);
    }
}
import { waitTillWrittenToDisk } from './__shared__';

describe('writeMetadata option', () => {
const tmpLocation = resolve(__dirname, './tmp/write-metadata-tests');
Expand All @@ -29,11 +20,9 @@ describe('writeMetadata option', () => {
test('creating a data store should not write __metadata__.json file', async () => {
const keyValueStore = await storage.keyValueStores().getOrCreate();
const expectedPath = resolve(storage.keyValueStoresDirectory, `${keyValueStore.id}`);
await waitTillWrittenToDisk(expectedPath);

const directoryFiles = await readdir(expectedPath);

expect(directoryFiles).toHaveLength(0);
// We check that reading the directory for the store throws an error, which means it wasn't created on disk
await expect(() => readdir(expectedPath)).rejects.toThrow();
});

test('creating a key-value pair in a key-value store should not write __metadata__.json file for the value', async () => {
Expand Down

0 comments on commit f65e3c6

Please sign in to comment.