Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow disabling storage persistence #1539

Merged
merged 7 commits into from
Sep 15, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions packages/core/src/configuration.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,17 @@ export interface ConfigurationOptions {
/**
* Sets the log level to the given value.
*
* Alternative to `CRAWLEE_LOG_LEVEL`.
* Alternative to `CRAWLEE_LOG_LEVEL` environment variable.
* @default 'INFO'
*/
logLevel?: LogLevel | LogLevel[keyof LogLevel];

/**
* Defines whether the storage client used should persist the data it stores.
*
* Alternative to `CRAWLEE_PERSIST_STORAGE` environment variable.
*/
persistStorage?: boolean;
}

/**
Expand Down Expand Up @@ -203,7 +210,9 @@ export interface ConfigurationOptions {
* `defaultDatasetId` | `CRAWLEE_DEFAULT_DATASET_ID` | `'default'`
* `defaultKeyValueStoreId` | `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` | `'default'`
* `defaultRequestQueueId` | `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID` | `'default'`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60e3`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60_000`
* `purgeOnStart` | `CRAWLEE_PURGE_ON_START` | `true`
* `persistStorage` | `CRAWLEE_PERSIST_STORAGE` | `true`
*
* ## Advanced Configuration Options
*
Expand All @@ -213,6 +222,8 @@ export interface ConfigurationOptions {
* `xvfb` | `CRAWLEE_XVFB` | -
* `chromeExecutablePath` | `CRAWLEE_CHROME_EXECUTABLE_PATH` | -
* `defaultBrowserPath` | `CRAWLEE_DEFAULT_BROWSER_PATH` | -
* `disableBrowserSandbox` | `CRAWLEE_DISABLE_BROWSER_SANDBOX` | -
* `availableMemoryRatio` | `CRAWLEE_AVAILABLE_MEMORY_RATIO` | `0.25`
*/
export class Configuration {
/**
Expand All @@ -233,9 +244,10 @@ export class Configuration {
CRAWLEE_DEFAULT_BROWSER_PATH: 'defaultBrowserPath',
CRAWLEE_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',
CRAWLEE_LOG_LEVEL: 'logLevel',
CRAWLEE_PERSIST_STORAGE: 'persistStorage',
};

protected static BOOLEAN_VARS = ['purgeOnStart', 'headless', 'xvfb', 'disableBrowserSandbox'];
protected static BOOLEAN_VARS = ['purgeOnStart', 'headless', 'xvfb', 'disableBrowserSandbox', 'persistStorage'];

protected static INTEGER_VARS = ['memoryMbytes', 'persistStateIntervalMillis', 'systemInfoIntervalMillis'];

Expand All @@ -251,6 +263,7 @@ export class Configuration {
headless: true,
persistStateIntervalMillis: 60_000,
systemInfoIntervalMillis: 60_000,
persistStorage: true,
};

/**
Expand Down Expand Up @@ -380,7 +393,11 @@ export class Configuration {
return this.services.get(cacheKey) as MemoryStorage;
}

const storage = new MemoryStorage(options);
const storage = new MemoryStorage({
persistStorage: this.get('persistStorage'),
// Override persistStorage if user provides it via storageClientOptions
...options,
});
this.services.set(cacheKey, storage);

return storage;
Expand Down
12 changes: 12 additions & 0 deletions packages/memory-storage/src/memory-storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ export interface MemoryStorageOptions {
* @default process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false
*/
writeMetadata?: boolean;

/**
* Whether the memory storage should also write its stored content to the disk.
*
* You can also disable this by setting the `CRAWLEE_PERSIST_STORAGE` environment variable to `false`.
* @default true
*/
persistStorage?: boolean;
}

export class MemoryStorage implements storage.StorageClient {
Expand All @@ -34,6 +42,7 @@ export class MemoryStorage implements storage.StorageClient {
readonly keyValueStoresDirectory: string;
readonly requestQueuesDirectory: string;
readonly writeMetadata: boolean;
readonly persistStorage: boolean;

readonly keyValueStoresHandled: KeyValueStoreClient[] = [];
readonly datasetClientsHandled: DatasetClient[] = [];
Expand All @@ -43,6 +52,7 @@ export class MemoryStorage implements storage.StorageClient {
s.object({
localDataDirectory: s.string.optional,
writeMetadata: s.boolean.optional,
persistStorage: s.boolean.optional,
}).parse(options);

// v3.0.0 used `crawlee_storage` as the default, we changed this in v3.0.1 to just `storage`,
Expand All @@ -61,6 +71,8 @@ export class MemoryStorage implements storage.StorageClient {
this.keyValueStoresDirectory = resolve(this.localDataDirectory, 'key_value_stores');
this.requestQueuesDirectory = resolve(this.localDataDirectory, 'request_queues');
this.writeMetadata = options.writeMetadata ?? process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false;
// Resolution order: explicit constructor option, then the `CRAWLEE_PERSIST_STORAGE` environment variable, then `true`.
// The parentheses around the fallback are required: `??` binds tighter than `?:`, so without them the whole
// `options.persistStorage ?? env` expression becomes the ternary condition and an explicit
// `persistStorage: false` would fall through to the `true` branch.
this.persistStorage = options.persistStorage
    ?? (process.env.CRAWLEE_PERSIST_STORAGE ? !['false', '0', ''].includes(process.env.CRAWLEE_PERSIST_STORAGE!) : true);

initWorkerIfNeeded();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class DatasetCollectionClient implements storage.DatasetCollectionClient
id: datasetInfo.name ?? datasetInfo.id,
data: datasetInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return datasetInfo;
Expand Down
2 changes: 2 additions & 0 deletions packages/memory-storage/src/resource-clients/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ export class DatasetClient<Data extends Dictionary = Dictionary> extends BaseCli
entityDirectory: existingStoreById.datasetDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down Expand Up @@ -260,6 +261,7 @@ export class DatasetClient<Data extends Dictionary = Dictionary> extends BaseCli
entityDirectory: this.datasetDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class KeyValueStoreCollectionClient implements storage.KeyValueStoreColle
id: kvStoreInfo.name ?? kvStoreInfo.id,
data: kvStoreInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return kvStoreInfo;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: existingStoreById.keyValueStoreDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down Expand Up @@ -304,6 +305,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: existingStoreById.keyValueStoreDirectory,
id: existingStoreById.name ?? existingStoreById.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Expand Down Expand Up @@ -334,6 +336,7 @@ export class KeyValueStoreClient extends BaseClient {
entityDirectory: this.keyValueStoreDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export class RequestQueueCollectionClient implements storage.RequestQueueCollect
id: queueInfo.name ?? queueInfo.id,
data: queueInfo,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});

return queueInfo;
Expand Down
3 changes: 3 additions & 0 deletions packages/memory-storage/src/resource-clients/request-queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: existingQueueById.requestQueueDirectory,
id: existingQueueById.name ?? existingQueueById.id,
writeMetadata: existingQueueById.client.writeMetadata,
persistStorage: existingQueueById.client.persistStorage,
});
}
}
Expand Down Expand Up @@ -361,6 +362,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: this.requestQueueDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand All @@ -372,6 +374,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue
entityDirectory: this.requestQueueDirectory,
id: this.name ?? this.id,
writeMetadata: this.client.writeMetadata,
persistStorage: this.client.persistStorage,
});
}

Expand Down
3 changes: 3 additions & 0 deletions packages/memory-storage/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ interface MetadataUpdate<Type extends EntityType, DataType> {
entityDirectory: string;
data: DataType;
writeMetadata: boolean;
persistStorage: boolean;
}

interface EntriesUpdate<Type extends EntityType, DataType> {
Expand All @@ -99,6 +100,7 @@ interface EntriesUpdate<Type extends EntityType, DataType> {
entityDirectory: string;
data: DataType;
writeMetadata: boolean;
persistStorage: boolean;
}

interface EntryDelete<Type extends EntityType> {
Expand All @@ -107,6 +109,7 @@ interface EntryDelete<Type extends EntityType> {
action: 'delete-entry';
entityDirectory: string;
writeMetadata: boolean;
persistStorage: boolean;
data: {
id: string;
};
Expand Down
13 changes: 9 additions & 4 deletions packages/memory-storage/src/workers/worker-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ export async function handleMessage(message: WorkerReceivedMessage & { messageId
}

async function updateMetadata(message: WorkerUpdateMetadataMessage) {
// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);

// Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present
if (!message.writeMetadata) {
return;
}

// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);

// Write the metadata to the file
const filePath = resolve(dir, '__metadata__.json');
await writeFile(filePath, JSON.stringify(message.data, null, '\t'));
Expand All @@ -61,6 +61,11 @@ async function lockAndWrite(filePath: string, data: unknown, stringify = true, r
}

async function updateItems(message: WorkerUpdateEntriesMessage) {
// Skip writing files to the disk if the client has the option set to false
if (!message.persistStorage) {
return;
}

// Ensure the directory for the entity exists
const dir = message.entityDirectory;
await ensureDir(dir);
Expand Down
11 changes: 11 additions & 0 deletions packages/memory-storage/test/__shared__.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { access } from 'node:fs/promises';
import { setTimeout } from 'node:timers/promises';

/**
 * Resolves once `path` can be accessed on disk.
 *
 * Polls with `access()` and sleeps 50 ms between failed attempts. There is no upper bound on the
 * number of attempts, so the returned promise stays pending for as long as the path is inaccessible.
 *
 * @param path File or directory path to wait for.
 */
export async function waitTillWrittenToDisk(path: string): Promise<void> {
    while (true) {
        try {
            await access(path);
            return;
        } catch {
            // Not accessible yet; back off briefly before probing again.
            await setTimeout(50);
        }
    }
}
61 changes: 61 additions & 0 deletions packages/memory-storage/test/no-writing-to-disk.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { readdir, rm } from 'node:fs/promises';
import { resolve } from 'node:path';
import { MemoryStorage } from '@crawlee/memory-storage';
import { waitTillWrittenToDisk } from './__shared__';

// Verifies that `persistStorage: false` keeps stored entries off the disk, both with and without
// the `__metadata__.json` file being written.
describe('persistStorage option', () => {
    const tmpLocation = resolve(__dirname, './tmp/no-writing-to-disk');

    // Remove everything the suites below may have written.
    afterAll(async () => {
        await rm(tmpLocation, { force: true, recursive: true });
    });

    describe('when false and writeMetadata is also false', () => {
        const localDataDirectory = resolve(tmpLocation, './no-metadata');
        const storage = new MemoryStorage({
            localDataDirectory,
            // The option is named `persistStorage`; any other key is not read by MemoryStorage.
            persistStorage: false,
        });

        test('creating a key-value pair in a key-value store should not write data to the disk', async () => {
            const keyValueStoreInfo = await storage.keyValueStores().getOrCreate();

            const keyValueStore = storage.keyValueStore(keyValueStoreInfo.id);
            await keyValueStore.setRecord({ key: 'foo', value: 'test' });
            const storePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}`);

            // NOTE(review): this waits for the store directory to appear. Confirm the worker still creates
            // the directory when both persistStorage and writeMetadata are false, otherwise this never resolves.
            await waitTillWrittenToDisk(storePath);

            const directoryFiles = await readdir(storePath);

            expect(directoryFiles).toHaveLength(0);
        });
    });

    describe('when false and writeMetadata is true', () => {
        const localDataDirectory = resolve(tmpLocation, './with-metadata');
        const storage = new MemoryStorage({
            localDataDirectory,
            persistStorage: false,
            writeMetadata: true,
        });

        test('creating a key-value pair in a key-value store should not write data to the disk, but it should write the __metadata__ file', async () => {
            const keyValueStoreInfo = await storage.keyValueStores().getOrCreate();

            const keyValueStore = storage.keyValueStore(keyValueStoreInfo.id);
            await keyValueStore.setRecord({ key: 'foo', value: 'test' });

            const storePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}`);

            await waitTillWrittenToDisk(storePath);

            const directoryFiles = await readdir(storePath);

            // Only the metadata file may be present; the record itself must stay in memory.
            expect(directoryFiles).toHaveLength(1);
            expect(directoryFiles).toEqual([
                '__metadata__.json',
            ]);
        });
    });
});
13 changes: 2 additions & 11 deletions packages/memory-storage/test/write-metadata.test.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,7 @@
import { MemoryStorage } from '@crawlee/memory-storage';
import { access, readdir, rm } from 'node:fs/promises';
import { readdir, rm } from 'node:fs/promises';
import { resolve } from 'node:path';
import { setTimeout } from 'node:timers/promises';

/**
 * Resolves once `path` can be accessed on disk.
 *
 * Probes with `access()`; on failure it sleeps 50 ms and retries by calling itself again.
 * There is no retry limit, so the promise stays pending while the path is inaccessible.
 *
 * @param path File or directory path to wait for.
 */
async function waitTillWrittenToDisk(path: string): Promise<void> {
try {
await access(path);
} catch {
// Not accessible yet; wait briefly, then try again.
await setTimeout(50);
return waitTillWrittenToDisk(path);
}
}
import { waitTillWrittenToDisk } from './__shared__';

describe('writeMetadata option', () => {
const tmpLocation = resolve(__dirname, './tmp/write-metadata-tests');
Expand Down