# Clean up YAGO's data

### YAGO Labels

In [1]:
import csv

# Convert yagoLabels.tsv into the two Neo4j bulk-import files. Every input row
# yields an Entity node, a Label node, and one relationship linking the two.
#
# All files are opened as UTF-8 explicitly instead of relying on the platform's
# locale encoding, and the input is opened with newline='' as the csv module
# asks for. The output files are opened with 'w' (they are never read back).
#
# NOTE(review): csv.reader keeps its default dialect here, so '"' acts as a
# quote character and enclosing quotes are dropped from fields -- confirm this
# is the intended handling of quoted label literals.
with open('yagoLabels.tsv', 'r', encoding='utf-8', newline='') as labels, \
        open('yago_entities.tsv', 'w', encoding='utf-8') as entities_file, \
        open('yago_rels.tsv', 'w', encoding='utf-8') as rels_file:
    reader = csv.reader(labels, delimiter='\t')
    next(reader, None)  # Skip license descriptions (tolerates an empty file)

    entities_file.write(':ID\tname\t:LABEL\n')  # TSV Header (entities)
    rels_file.write(':START_ID\t:END_ID\t:TYPE\n')  # TSV Header (relations)

    for i, row in enumerate(reader):
        if not row:
            continue  # csv.reader yields [] for blank lines

        entity = row[1]
        relation = row[2]
        label = row[3]

        # The row index makes the Label node ID unique per row. The '_'
        # separator keeps the ID unambiguous: without it, row 1 with label
        # '23x' and row 12 with label '3x' would both produce '123x'.
        label_id = '{}_{}'.format(i, label)

        entities_file.write('{}\t{}\tEntity\n'.format(entity, entity))
        entities_file.write('{}\t{}\tLabel\n'.format(label_id, label))

        # Normalise the predicate: drop enclosing '<...>' and any 'prefix:'
        # namespace. startswith() is safe on an empty field, unlike
        # relation[0].
        if relation.startswith('<'):
            relation = relation[1:-1]
        if ':' in relation:
            relation = relation.split(':')[1]
        rels_file.write('{}\t{}\t{}\n'.format(entity, label_id, relation))

### YAGO Facts

In [2]:
import csv

# Append yagoFacts.tsv to the Neo4j import files: both endpoints of every fact
# are written as Entity nodes and the fact itself as a relationship.
#
# Files are opened as UTF-8 explicitly (not the platform's locale encoding);
# the input uses newline='' as the csv module asks for. Output is appended
# ('a'), so re-running this cell alone adds the same rows again.
with open('yagoFacts.tsv', 'r', encoding='utf-8', newline='') as facts_file, \
        open('yago_entities.tsv', 'a', encoding='utf-8') as entities_file, \
        open('yago_rels.tsv', 'a', encoding='utf-8') as rels_file:
    facts_reader = csv.reader(facts_file, delimiter='\t')
    next(facts_reader, None)  # Skip license descriptions (tolerates an empty file)

    for row in facts_reader:
        if not row:
            continue  # csv.reader yields [] for blank lines

        subject = row[1]
        relation = row[2]
        target = row[3]

        # Strip the enclosing '<...>' only when it is present, so a predicate
        # without brackets does not lose its first and last characters.
        if relation.startswith('<'):
            relation = relation[1:-1]

        entities_file.write('{}\t{}\tEntity\n'.format(subject, subject))
        entities_file.write('{}\t{}\tEntity\n'.format(target, target))
        rels_file.write('{}\t{}\t{}\n'.format(subject, target, relation))

### YAGO Types

In [3]:
import csv

# Append yagoTypes.tsv to the Neo4j import files: each row becomes a 'type'
# relationship from the entity (column 1) to its type (column 3).
#
# Both endpoints are written as Entity nodes so that every relationship refers
# to a node present in the node file; writing only the type would leave the
# start node undefined for entities that appear in no other input file. The
# repeated lines this produces are removed by the duplicate-removal step that
# follows this cell.
#
# Files are opened as UTF-8 explicitly (not the platform's locale encoding);
# the input uses newline='' as the csv module asks for.
with open('yagoTypes.tsv', 'r', encoding='utf-8', newline='') as types_file, \
        open('yago_entities.tsv', 'a', encoding='utf-8') as entities_file, \
        open('yago_rels.tsv', 'a', encoding='utf-8') as rels_file:
    types_reader = csv.reader(types_file, delimiter='\t')
    next(types_reader, None)  # Skip license descriptions (tolerates an empty file)

    for row in types_reader:
        if not row:
            continue  # csv.reader yields [] for blank lines

        subject = row[1]
        type_name = row[3]

        entities_file.write('{}\t{}\tEntity\n'.format(subject, subject))
        entities_file.write('{}\t{}\tEntity\n'.format(type_name, type_name))
        rels_file.write('{}\t{}\ttype\n'.format(subject, type_name))

*There are some duplicate lines in `yago_entities.tsv`; run the following command in `bash` to remove them:*
``` 
$ awk '!a[$0]++' yago_entities.tsv > yago_entities_nodup.tsv
```

---

# Import data to Neo4j
Run the following `bash` command to import the data into Neo4j:

``` bash
neo4j-admin import --database="graph.db" \
	--nodes "import/yago_entities_nodup.tsv" \
	--relationships "import/yago_rels.tsv" \
	--delimiter="\t"
```