Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 91 additions & 37 deletions src/lib/__tests__/useGetResourceLinks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,51 @@ import { beforeEach, describe, expect, test, vi } from 'vitest';
import { fetchUrl, transformUrl } from '../useGetResourceLinks';

const htmlWithLinks = `
<script src="my-script.js" defer=""></script>
<script src="my-styles.css" defer=""></script>
<a href="https://arxiv.org/pdf/1234.5678.pdf">arXiv</a>
<a href="https://doi.org/10.1234/abcd">DOI</a>
<a href="https://example.com/page.html">HTML</a>
<a href="https://example.com/style.css">CSS</a>
<a href="http://www.nasa.gov">NASA</a>
<a href="https://example.com/document.pdf">PDF</a>
<a href="https://example.com/page.html">HTML Duplicate</a>
<body>
<div class="header-container">
<header class="starry-background-wrapper">
<div class="logo-header">
<a href="/">
<img src="/styles/img/scix-logo.svg" alt="Science Explorer Logo">
<b>Science Explorer</b>
</a>
</div>
</header>
</div>
<div class="main-container container-sm">
<h3 class="text-center">links for <a href="/abs/2023ApJ/abstract"><b>2023ApJ</b></a></h3>
<div class="list-group">
<div class="list-group-item">
<a href="https://arxiv.org/abs/2310.03851" class="title">https://arxiv.org/abs/2310.03851</a>
</div>
<div class="list-group-item">
<a href="https://arxiv.org/pdf/2310.03851" class="title">https://arxiv.org/pdf/2310.03851</a>
</div>
<div class="list-group-item">
<a href="https://doi.org/10.3847/1538-4357/acffbd" class="title">https://doi.org/10.3847/1538-4357/acffbd</a>
</div>
<div class="list-group-item">
<a href="https://example.com/document.pdf" class="title">https://example.com/document.pdf</a>
</div>
</div>
</div>
<div class="footer-container">
<footer>
<a href="http://www.si.edu" target="_blank">Smithsonian</a>
<a href="https://www.cfa.harvard.edu/" target="_blank">CfA</a>
<a href="http://www.nasa.gov" target="_blank">NASA</a>
<a href="https://scixplorer.org/scixabout">About SciX</a>
<a href="https://scixplorer.org/feedback/general">Give Feedback</a>
<a href="https://twitter.com/scixcommunity">@scixcommunity</a>
</footer>
</div>
</body>
`;

const SKIP_URLS = [
'http://www.cfa.harvard.edu/sao',
'https://www.cfa.harvard.edu/',
'http://www.si.edu',
'http://www.nasa.gov',
];

const expectedUrls = [
{ type: 'arXiv', url: 'https://arxiv.org/pdf/1234.5678.pdf' },
{ type: 'DOI', url: 'https://doi.org/10.1234/abcd' },
{ type: 'HTML', url: 'https://example.com/page.html' },
{ type: 'arXiv', url: 'https://arxiv.org/abs/2310.03851' },
{ type: 'arXiv', url: 'https://arxiv.org/pdf/2310.03851' },
{ type: 'DOI', url: 'https://doi.org/10.3847/1538-4357/acffbd' },
{ type: 'PDF', url: 'https://example.com/document.pdf' },
];

Expand All @@ -38,12 +61,6 @@ describe('resourceLinks', () => {
expect(transformUrl('https://example.com/script.js')).toBeNull();
});

test('transformUrl filters known skipped domains', () => {
for (const url of SKIP_URLS) {
expect(transformUrl(url)).toBeNull();
}
});

test('transformUrl assigns correct type', () => {
expect(transformUrl('https://arxiv.org/pdf/foo.pdf')).toEqual({
type: 'arXiv',
Expand Down Expand Up @@ -81,6 +98,8 @@ describe('resourceLinks', () => {
test('fetchUrl returns deduplicated transformed links', async () => {
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
mockFetch.mockResolvedValueOnce({
ok: true,
redirected: false,
text: () => Promise.resolve(htmlWithLinks),
});

Expand All @@ -91,7 +110,9 @@ describe('resourceLinks', () => {
test('fetchUrl returns empty list if input has no valid links', async () => {
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
mockFetch.mockResolvedValueOnce({
text: () => Promise.resolve('<p>No links here</p>'),
ok: true,
redirected: false,
text: () => Promise.resolve('<div class="list-group"></div>'),
});

const result = await fetchUrl('fake-id');
Expand All @@ -104,14 +125,14 @@ describe('Redirected response', () => {
vi.resetAllMocks();
global.fetch = vi.fn();
});
test('fetchUrl handles 302 redirect and uses Location header', async () => {

test('fetchUrl detects browser-followed redirect via res.redirected', async () => {
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
mockFetch.mockResolvedValueOnce({
status: 302,
headers: {
get: (name: string) => (name === 'Location' ? 'https://doi.org/10.1234/foo' : null),
},
text: () => Promise.resolve(''), // not used in redirect
ok: true,
redirected: true,
url: 'https://doi.org/10.1234/foo',
text: () => Promise.resolve(''),
});

const result = await fetchUrl('test-id');
Expand All @@ -124,18 +145,51 @@ describe('Redirected response', () => {
]);
});

test('fetchUrl returns empty if redirect has no Location', async () => {
test('fetchUrl returns empty if redirected URL is not valid', async () => {
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
mockFetch.mockResolvedValueOnce({
status: 302,
headers: {
get: (() => null) as (name: string) => string | null,
},
ok: true,
redirected: true,
url: '',
text: () => Promise.resolve(''),
});

const result = await fetchUrl('test-id');
expect(result).toEqual([]);
});
});

// Responses flagged `ok: false` are expected to produce no resource links,
// regardless of what the response body contains.
describe('Error responses', () => {
  beforeEach(() => {
    // Start every test from a clean, un-primed fetch stub.
    vi.resetAllMocks();
    global.fetch = vi.fn();
  });

  test('fetchUrl returns empty list on 404', async () => {
    // The error page body still contains an anchor (in the footer); the
    // expectation below asserts that it does not show up in the result.
    const notFoundPage =
      '<h3>The requested resource does not exist</h3>' +
      '<footer><a href="https://scixplorer.org/scixabout">About</a></footer>';

    const fetchStub = global.fetch as unknown as ReturnType<typeof vi.fn>;
    fetchStub.mockResolvedValueOnce({
      ok: false,
      status: 404,
      text: () => Promise.resolve(notFoundPage),
    });

    expect(await fetchUrl('bad-bibcode')).toEqual([]);
  });

  test('fetchUrl returns empty list on 500', async () => {
    const fetchStub = global.fetch as unknown as ReturnType<typeof vi.fn>;
    fetchStub.mockResolvedValueOnce({
      ok: false,
      status: 500,
      text: () => Promise.resolve('Internal Server Error'),
    });

    const links = await fetchUrl('error-bibcode');
    expect(links).toEqual([]);
  });
});
47 changes: 23 additions & 24 deletions src/lib/useGetResourceLinks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { isValidURL } from '@/utils/common/isValidURL';

/**
 * The labels a resource link can be classified under.
 * Declared `as const` so the entries keep their string-literal types.
 */
export const resourceUrlTypes = ['arXiv', 'PDF', 'DOI', 'HTML', 'Other'] as const;

export type ResourceUrlType = typeof resourceUrlTypes[number];
export type ResourceUrlType = (typeof resourceUrlTypes)[number];

export interface IResourceUrl {
type: ResourceUrlType;
Expand All @@ -15,14 +15,6 @@ interface IUseResourceLinksProps {
options?: UseQueryOptions<IResourceUrl[]>;
}

// TODO: slightly brittle, since these links could change over time
const SKIP_URLS = [
'http://www.cfa.harvard.edu/sao',
'https://www.cfa.harvard.edu/',
'http://www.si.edu',
'http://www.nasa.gov',
];

const URL_TYPE_MAP: Record<string, ResourceUrlType> = {
arxiv: 'arXiv',
pdf: 'PDF',
Expand All @@ -31,14 +23,13 @@ const URL_TYPE_MAP: Record<string, ResourceUrlType> = {
};

/**
 * Matches strings containing one of the listed static-asset extensions
 * (images, stylesheets, scripts, fonts, source maps, video) immediately
 * followed by `?` or the end of the string. Case-insensitive.
 */
const RESOURCE_EXT_REGEX = /\.(jpg|jpeg|png|gif|webp|svg|css|js|ico|woff2?|ttf|otf|eot|map|mp4|webm)(\?|$)/i;
const URL_REGX = /href="(https?:\/\/[^"]*)"/gi;

/**
* Transforms a URL into a structured resource link object.
* @param url
*/
export const transformUrl = (url: string) => {
if (!url || typeof url !== 'string' || !isValidURL(url) || RESOURCE_EXT_REGEX.test(url) || SKIP_URLS.includes(url)) {
if (!url || typeof url !== 'string' || !isValidURL(url) || RESOURCE_EXT_REGEX.test(url)) {
return null;
}

Expand All @@ -56,29 +47,37 @@ export const fetchUrl = async (identifier: string): Promise<IResourceUrl[]> => {
const url = `/link_gateway/${encodeURIComponent(identifier)}/ESOURCE`;
const res = await fetch(url);

// check for 302 redirects
if (res.status === 302 || res.status === 301) {
const redirectUrl = res.headers.get('Location');
if (redirectUrl) {
const transformedUrl = transformUrl(redirectUrl);
return transformedUrl ? [transformedUrl] : [];
}
if (!res.ok) {
return [];
}

// single-link resources redirect directly to the target URL
if (res.redirected) {
const transformedUrl = transformUrl(res.url);
return transformedUrl ? [transformedUrl] : [];
}

const raw = await res.text();
if (!raw) {
return [];
}

const seen = new Set<string>();
const result = Array.from(raw.matchAll(URL_REGX), ([, href]) => transformUrl(href));
const parser = new DOMParser();
const doc = parser.parseFromString(raw, 'text/html');
const links = doc.querySelectorAll('.list-group-item a');

const seen = new Set<string>();
const output: IResourceUrl[] = [];
for (const res of result) {
if (res && !seen.has(res.url)) {
seen.add(res.url);
output.push(res);

for (const link of links) {
const href = link.getAttribute('href');
if (!href) {
continue;
}
const transformed = transformUrl(href);
if (transformed && !seen.has(transformed.url)) {
seen.add(transformed.url);
output.push(transformed);
}
}

Expand Down
Loading