-
Notifications
You must be signed in to change notification settings - Fork 42
/
data-transformation.ts
109 lines (95 loc) · 3.38 KB
/
data-transformation.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import Reuse, { ReuseToJson, ReuseType } from '../src/backend/models/Reuse';
import * as fs from 'fs';
import * as path from 'path';
import parse from 'csv-parse/lib/sync';
// Script entry: announce the run, then collect every usable row from the CSV
// files found in the data directory.
console.log("Transforming CSV data to JSON...");

// Directory that is scanned for CSV files. The path is relative, so Node
// resolves it against the current working directory of the process.
const dataPath = "../workflow/done";

// Stage 1: names of all entries in the data directory that end in ".csv".
// (A second filter that would drop '-sample.csv' files exists in the history
// of this script but is intentionally not applied here.)
const csvFileNames = fs.readdirSync(dataPath).filter(name => name.endsWith('.csv'));

// Stage 2: read every file fully as UTF-8 text, logging each name as it is read.
const csvTexts: string[] = [];
for (const name of csvFileNames) {
    console.log(`Processing ${name}...`);
    csvTexts.push(fs.readFileSync(path.join(dataPath, name), { encoding: 'utf-8' }));
}

// Stage 3: parse each text and pool the rows of all files into one flat list.
// Rows are addressed by column name further down, hence `columns: true`.
const allRows: Array<any> = [];
for (const text of csvTexts) {
    const rows = parse(text,
        {
            delimiter: ',',
            columns: true,
            skip_empty_lines: true,
            ltrim: true,
            bom: true
        }) as Array<any>;
    for (const row of rows) {
        allRows.push(row);
    }
}

// Stage 4: keep only rows that name a reused work, either through a non-blank
// 'reused_doi' or, failing that, through a non-blank 'alt_url'.
const csvContents = allRows.filter(row => {
    if (row['reused_doi'].trim() !== '') {
        return true;
    }
    return row['alt_url'].trim() !== '';
});
/**
 * Builds a `Reuse` record from one parsed CSV row.
 *
 * The DOI columns ('paper_doi', 'reused_doi') go through `ProcessDOI`, the
 * 'reuse_type' column through `TransformType`, and 'alt_url' through
 * `ProcessAlternativeId`; the remaining columns are copied over unchanged.
 *
 * @param csvData One CSV row, addressed by column name.
 * @returns The assembled record. A `null` or `undefined` row is handed back
 *          as-is instead of being converted.
 */
function ReuseFromCSVData(csvData: any): Reuse {
    // `== null` matches exactly null and undefined.
    if (csvData == null) {
        return csvData;
    }

    const reuse: Reuse = {
        sourceDOI: ProcessDOI(csvData.paper_doi),
        reusedDOI: ProcessDOI(csvData.reused_doi),
        type: TransformType(csvData.reuse_type, csvData),
        comment: csvData.comment,
        sourceReference: csvData.citation_number,
        alternativeID: ProcessAlternativeId(csvData.alt_url),
        sourceReferenceDetail: csvData.page_num,
        contributor: csvData.gh_id
    };
    return reuse;
}
/**
 * Reduces a DOI that was entered as a URL to the bare DOI.
 *
 * Recognised URL forms (scheme and host are matched case-insensitively, and
 * both `http` and `https` are accepted):
 *  - the DOI resolver: `doi.org/` and the legacy `dx.doi.org/`
 *  - ACM Digital Library article pages: `dl.acm.org/doi/`, optionally followed
 *    by one of the view segments `abs/`, `pdf/`, `full/`, `epdf/`, `fullHtml/`
 *
 * Only the first occurrence of each form is removed. Text that matches none of
 * the forms is returned unchanged apart from trimming.
 *
 * @param doi DOI or DOI URL as found in the CSV.
 * @returns The input with the recognised URL prefix removed and surrounding
 *          whitespace trimmed.
 */
function ProcessDOI(doi : string) : string {
    // DOI resolver, including the older "dx." host name.
    const resolverUrl = /https?:\/\/(?:dx\.)?doi\.org\//i;
    // ACM DL article URL; the view segment is optional so the plain
    // ".../doi/<DOI>" form is covered by the same pattern.
    const acmUrl = /https?:\/\/dl\.acm\.org\/doi\/(?:(?:abs|pdf|full|epdf|fullHtml)\/)?/i;

    return doi
        .replace(resolverUrl, "")
        .replace(acmUrl, "")
        .trim();
}
/**
 * Shortens an arXiv abstract URL to the compact `arxiv:<id>` form.
 *
 * The first occurrence of `http(s)://arxiv.org/abs/` (optionally with a
 * `www.` host prefix, matched case-insensitively) is replaced by `arxiv:`.
 * Any other text is returned unchanged; no trimming is performed.
 *
 * @param altId Alternative identifier or URL as found in the CSV.
 * @returns The identifier with a recognised arXiv abstract prefix rewritten.
 */
function ProcessAlternativeId(altId : string) : string {
    const arxivAbstractUrl = /https?:\/\/(?:www\.)?arxiv\.org\/abs\//i;
    return altId.replace(arxivAbstractUrl, "arxiv:");
}
/**
 * Maps the free-text reuse type of a CSV row onto a `ReuseType` member.
 *
 * The text is lower-cased and trimmed, then looked up among the known
 * spellings of each type. Text that matches no spelling yields
 * `ReuseType.UNKNOWN`.
 *
 * @param csvType Reuse type exactly as written in the CSV.
 * @param csvData The complete CSV row. Not used by the mapping itself; it is
 *                only available for ad-hoc debugging of unrecognised types.
 * @returns The matching `ReuseType`, or `ReuseType.UNKNOWN`.
 */
function TransformType(csvType : string, csvData: any) : ReuseType {
    const label = csvType.toLowerCase().trim();

    // Each entry pairs a reuse type with every spelling accepted for it.
    const spellingsByType: Array<[ReuseType, readonly string[]]> = [
        [ReuseType.METHODOLOGY, ["method", "theory"]],
        [ReuseType.TOOL, ["tool", "algorithm"]],
        [ReuseType.DATASET, ["dataset", "dataset reuse"]],
        [ReuseType.STATISTICS, ["statistics", "statistical", "statistical reuse", "statistical method"]],
        [ReuseType.METRIC, ["metric", "metrics", "metric reuse"]],
        [ReuseType.STEPPINGSTONE, ["stepping stone", "stepping-stone", "stepping stones"]],
        [ReuseType.SANITYCHECK, ["sanity check", "sanity-check"]],
        [ReuseType.REPLICATION, ["replication"]],
    ];

    for (const [reuseType, spellings] of spellingsByType) {
        if (spellings.includes(label)) {
            return reuseType;
        }
    }

    // Nothing matched. To inspect offending rows while debugging, log `label`
    // and `csvData` here.
    return ReuseType.UNKNOWN;
}
// Turn every retained CSV row into a Reuse record.
const result: Array<Reuse> = csvContents.map(ReuseFromCSVData);

// Serialize all records as one JSON array. Stringifying the array as a whole
// (instead of joining individually stringified elements by hand) guarantees
// well-formed JSON even if an element has no JSON representation: such an
// element becomes `null` rather than an empty slot between two commas.
const outputObject = JSON.stringify(result.map(ReuseToJson));

// The output path is relative and therefore resolved against the current
// working directory of the process.
fs.writeFileSync('./src/assets/data/reuse.json', outputObject);
console.log("Transformation complete.");