/
csv_parsing.py
112 lines (87 loc) · 3.6 KB
/
csv_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
import csv
import sys
import re
import string
import locale
# German kinship terms searched for in the "Personalien" text of a caption.
# Kept as a list because the order is the order in which they are tried.
RELATIONSHIPS = [
    "Tochter", "Sohn",
    "Beihälterin", "Beihälter",
    "Ehefrau", "Ehemann",
    "Vater", "Mutter",
    "Schwester", "Bruder",
]
# Labels of the "<label>: <value>" lines that are looked up in a caption.
FIELDS = ["Ort", "Beruf", "Kennzeichen", "Personalien"]
class Person:
    """Plain record for one person.

    Attributes:
        surname: family name, initially the empty string.
        firstname: given name, initially the empty string.
        details: free-form dict of further attributes, initially empty.
    """

    # Class-level default; __init__ rebinds ``details`` on every instance,
    # so instances never share this dict.
    details = {}

    def __init__(self):
        self.details = {}
        self.surname = self.firstname = ""
def _as_text(value):
    """Return *value* as text, decoding it as UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _parse_caption(caption, reference_year):
    """Build a Person from one caption string.

    Args:
        caption: the caption text; its first line holds the name(s) and the
            following lines may hold "<field>: <value>" entries for FIELDS.
        reference_year: year a "geboren ... YYYY" birth year is subtracted
            from to obtain an age.

    Returns:
        A Person with ``surname``, ``firstname`` and ``details`` filled in.

    Raises:
        ValueError: if the caption is missing, has no newline-terminated
            first line, or the name is not in "Surname, Firstname" form.
    """
    if caption is None:
        raise ValueError("row has no caption")

    person = Person()

    # Some values are wrapped over two lines. A line without a colon that
    # follows another line is folded back onto it.
    for head, newline, tail in re.findall("(.+)(\n)([^:]+?\n)", caption):
        caption = caption.replace(head + newline + tail, head + " " + tail)

    # Reset per caption so a caption without a Personalien line never sees
    # the value parsed from an earlier one.
    personalien = ""
    for fieldname in FIELDS:
        # Put the field on its own line if it never starts one.
        if fieldname in caption and "\n" + fieldname not in caption:
            caption = caption.replace(fieldname, "\n" + fieldname)
        found = re.search("\n" + re.escape(fieldname) + ": (.*?)\n", caption)
        if not found:
            continue
        if fieldname == "Personalien":
            # Parsed further below rather than stored verbatim.
            personalien = found.group(1)
        else:
            person.details[fieldname] = found.group(1)

    first_line = re.match("\n?(.*?)\n", caption)
    if first_line is None:
        raise ValueError("caption has no newline-terminated name line")
    names = first_line.group(1)

    # Split off nickname and aliases; what remains is "Surname, Firstname".
    for marker in ("vulgo", "genannt"):
        separator = ", " + marker + " "
        if separator in names:
            pieces = names.split(separator)
            person.details["nickname"] = pieces[1]
            names = pieces[0]
    if ", alias " in names:
        pieces = names.split(", alias ")
        person.details["aliases"] = pieces[1].split(", ")
        names = pieces[0]

    name_parts = names.split(", ")
    if len(name_parts) < 2:
        raise ValueError(
            "name is not in 'Surname, Firstname' form: %r" % (names,))
    person.surname = name_parts[0]
    person.firstname = name_parts[1]
    person.details["name"] = person.firstname + " " + person.surname

    # NOTE(review): a stated age is stored as the matched string, an age
    # derived from a birth year as an int; the latter wins if both occur.
    stated_age = re.search("([0-9]{2}) Jahre alt", personalien)
    if stated_age:
        person.details["age"] = stated_age.group(1)
    birth_year = re.search("geboren.* ([0-9]{4})", personalien)
    if birth_year:
        person.details["age"] = reference_year - int(birth_year.group(1))

    relationships = {}
    person.details["relationships"] = relationships
    # Pattern and subject must both be text so that non-ASCII terms such as
    # "Beihälterin" and \w (with re.U) compare by character, not by byte.
    personalien_text = _as_text(personalien)
    for relationship in RELATIONSHIPS:
        if relationship not in personalien:
            continue
        relationship_text = _as_text(relationship)
        found = re.search(
            re.escape(relationship_text)
            + r" de(r|s) ([\w .]*)(,|\(|\n| und)?",
            personalien_text,
            re.U,
        )
        if found:
            print(u"%s %s" % (relationship_text, found.group(2)))
            relationships[relationship] = found.group(2)

    return person


def run(input_filename, reference_year=1853):
    """Parse the "Original caption" column of a CSV file into Person objects.

    Captions that cannot be parsed are printed together with the reason and
    skipped. For every parsed caption the extracted relationships are
    printed.

    Args:
        input_filename: path of a CSV file with a header row containing an
            "Original caption" column.
        reference_year: year a birth year found in the caption is subtracted
            from to derive an age. Defaults to 1853.

    Returns:
        List of Person objects, one per successfully parsed row, in file
        order.

    Raises:
        KeyError: if the file has no "Original caption" column.
    """
    persons = []
    with open(input_filename) as f:
        for row in csv.DictReader(f):
            caption = row["Original caption"]
            try:
                person = _parse_caption(caption, reference_year)
            except ValueError as e:
                # Report the offending caption and carry on with the rest.
                print(caption)
                print(e)
                print("")
                continue
            print(person.details["relationships"])
            persons.append(person)
    return persons
# Script entry point: parse the CSV file named by the first CLI argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: %s INPUT_CSV" % sys.argv[0])
    input_filename = sys.argv[1]
    run(input_filename)