-
Notifications
You must be signed in to change notification settings - Fork 0
/
SearchMedium.py
162 lines (138 loc) · 6.45 KB
/
SearchMedium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/python3
__author__ = 'Antonio Masotti'
__date__ = '20.09.2018'
'''
Main script collection
'''
#import necessary packages
import os
from tkinter import filedialog as tkFileDialog
import re
import requests
from bs4 import BeautifulSoup
from auxiliary.Auxialiary import *
# Middle-voice endings (Homeric endings included).
# Each entry is a regex alternation group listing the accent variants of one
# ending; the entries are later embedded into a word-level pattern.
# TODO: use an accent decoder / ASCII simplifier to ignore accents
# NOTE(review): a few alternatives look duplicated (e.g. in the "μεθα" and
# "σθωσαν" groups); they may differ only in Unicode normalization, so they
# are kept exactly as they were - confirm before deduplicating.
M_ENDUNGEN = ["(μαί|μαι|μαῖ|μᾶι)",
              "(άι|αι|αί|αῖ|ᾶι)",
              "(ῃ|ῇ|ῄ|ει|εί|εῖ)",
              "(τάι|ται|ταί|ταῖ|τᾶι)",
              "(το|τό)",
              "(μεθα|μεθα|μέθα|μεθά)",
              "(σθέ|σθε|σθ)",
              "(νταί|νται|ντάι|νταῖ|ντ|ντο|ντό)",
              "(σθαί|σθάι|σθαι|σθ)",
              "(μήν|μην|μῆν)",
              "(σο|σό)",
              "(ω|ώ|ῶ)",
              "(ου|ού|οῦ)",
              "(ἱό|ιό|ιο)",
              "(σθω|σθώ|σθῶ)",
              "(σθων|σθών|σθῶν)",
              "(σθωσαν|σθώσαν|σθῶσαν|σθωσαν|σθωσάν)",
              # Raw string: "\w" is a regex class, not a string escape.
              r"(μέν\w{1,3}|μεν\w{1,3})"]
# --- Output location -------------------------------------------------------
# Ask the user for the directory and base name of the two output files:
# the result table (<name>.txt) and the statistics file (<name>_Stat.txt).
print('Choose where to save the output file: ')
# NOTE(review): "pause" looks like the Windows cmd built-in; on other
# platforms this call presumably does not wait for a key press - confirm
# the intended target OS.
os.system("pause")
OUTPUT_DIRECTORY = tkFileDialog.askdirectory()
if not OUTPUT_DIRECTORY:
    # The dialog hands back an empty value when it is cancelled; stop with a
    # clear message instead of failing inside os.chdir().
    raise SystemExit("No output directory selected - aborting.")
# Relative output paths used further down resolve against this directory.
os.chdir(OUTPUT_DIRECTORY)
NAME_OUTPUT = input("How do you want to name the output file: ")
OUTPUT_FILE = NAME_OUTPUT + ".txt"
OUTPUT_STATISTICS = NAME_OUTPUT + "_Stat.txt"
# --- Main analysis ----------------------------------------------------------
# For every text file in a user-selected directory: collect the words ending
# in one of the M_ENDUNGEN endings, look each one up on Perseus, write one
# table row per recognised verb form and append per-file statistics.

# Perseus morphology endpoint, queried once per candidate form.
_PERSEUS_MORPH_URL = "http://www.perseus.tufts.edu/hopper/morph"
# Seconds to wait for Perseus before giving up on a single form.
_PERSEUS_TIMEOUT = 30

# One whole-word pattern per ending, compiled once instead of once per file.
_FORM_PATTERNS = [re.compile("\\b(\\w+" + endung + ")\\b", re.MULTILINE)
                  for endung in M_ENDUNGEN]
# A result-table cell whose content starts with "verb" or "part".
_MORPH_CELL_RE = re.compile(r"(<td>((verb|part)(.*))<\/td>)")
# A lemma containing a word that ends in -μαι is flagged as medium tantum.
_MEDIA_TANTUM_RE = re.compile(r"(\w*(μάι|μαι|μαί|μαῖ|μᾶι)\b)", re.MULTILINE)


def _lookup_on_perseus(form):
    """Look up *form* with the Perseus morphology tool.

    Returns a ``(lemma, translation, morpho_parsing)`` tuple when the result
    page contains a verb/participle cell and a non-empty lemma heading.
    Returns ``None`` otherwise, including when the HTTP request fails.
    A missing definition yields an empty ``translation``.
    """
    try:
        response = requests.get(_PERSEUS_MORPH_URL,
                                params={"l": form, "la": "greek"},
                                timeout=_PERSEUS_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable lookup should not abort the whole run.
        print("Perseus request failed for " + str(form) + ": " + str(error))
        return None
    parsed = BeautifulSoup(response.text, 'lxml')

    # Start from "nothing found" for every form, so that a page without any
    # matching <td> can never reuse the parse of the previous form.
    morpho_parsing = None
    for cell in parsed.select('td'):
        hit = _MORPH_CELL_RE.search(str(cell))
        if hit is not None:
            morpho_parsing = hit.group(2)
            break
    if morpho_parsing is None:
        return None

    headings = parsed.select('h4.greek')
    lemma = headings[0].text.strip() if headings else ''
    if lemma == '':
        return None

    definitions = parsed.select('span.lemma_definition')
    translation = definitions[0].text.strip() if definitions else ''
    return lemma, translation, morpho_parsing


def _write_statistics(id_text, total_forms, not_found, media_tantum,
                      non_media_tantum):
    """Append the summary for one text to the statistics file."""
    # Absolute path, so the result does not depend on the current directory.
    stats_path = os.path.join(OUTPUT_DIRECTORY, OUTPUT_STATISTICS)
    with open(stats_path, "a", encoding="UTF-8") as stat:
        stat.write("----STATISTICS OF " + str(id_text) + "----\n")
        stat.write("-------------------------------\n---NOT-FOUND or NOT-VERB---")
        stat.write("\n" + ",".join(not_found) + "\n")
        stat.write("--------------------------------------------------------\n")
        stat.write("Total verbal forms found: " + str(total_forms) +
                   "\n" + "Media_tantum: " + str(len(media_tantum)) + "\n" +
                   "Not-Media_tantum:" + str(len(non_media_tantum)) + "\n" +
                   "Forms not found on Perseus or not verbs: "
                   + str(len(not_found)) +
                   "\n--------------------------------------------------------\n\n")


def _analyse_file(element, table):
    """Analyse the text file *element* and write its rows to *table*."""
    with open(element, "r", encoding="UTF-8") as text_file:
        raw_text = text_file.read()
    id_text = os.path.basename(os.path.splitext(element)[0])
    # new text = text without verse numbers
    text = delete_number(raw_text)

    # Collected per file: sharing one list across files would attribute the
    # forms of earlier texts to every later text as well.
    verbal_forms = [pattern.findall(text) for pattern in _FORM_PATTERNS]
    # prepare results; the first element is the list of verbal forms
    preprocessed_verb = trasforma_result(verbal_forms)[0]
    total_forms = len(preprocessed_verb)

    not_found = []
    media_tantum = []
    non_media_tantum = []
    # iterate through the verb forms found
    for counter, form in enumerate(preprocessed_verb, start=1):
        print("-------------------------------------")
        print("PROGRESS: " + str(counter) +
              " of " + str(total_forms) +
              " file currently analyzed: " + str(id_text))
        print("-------------------------------------")

        result = _lookup_on_perseus(form)
        if result is None:
            not_found.append(form)
            continue
        lemma, translation, morpho_parsing = result

        if _MEDIA_TANTUM_RE.search(lemma) is not None:
            mtantum = 1
            media_tantum.append(lemma)
        else:
            mtantum = 0
            non_media_tantum.append(lemma)
        table.write("-".join(str(field) for field in
                             (id_text, form, lemma, translation,
                              morpho_parsing, mtantum)) + "\n")

    _write_statistics(id_text, total_forms, not_found,
                      media_tantum, non_media_tantum)


# Mode "w" empties an existing table file before the header is written.
with open(OUTPUT_FILE, "w", encoding="UTF-8") as table:
    table.write("id_text-token-lemma-translation-pos-mediumTantum\n")
    while True:
        print('Select the directory with the raw text files: ')
        os.system("pause")
        TEXT_DIRECTORY = tkFileDialog.askdirectory()
        if TEXT_DIRECTORY:
            # Absolute paths of the regular files only; a sub-directory
            # cannot be opened as a text file.
            FILE_LIST = [os.path.abspath(os.path.join(TEXT_DIRECTORY, x))
                         for x in os.listdir(TEXT_DIRECTORY)]
            FILE_LIST = [path for path in FILE_LIST if os.path.isfile(path)]
            for element in FILE_LIST:
                _analyse_file(element, table)
        else:
            # The dialog was cancelled: nothing to scan in this round.
            print("No directory selected.")
        # ask the user whether another directory should be scanned
        if not again():
            break