-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanup.py
41 lines (36 loc) · 1015 Bytes
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import textract
import os
# Directory the script is launched from; the input directory is looked up
# relative to it.
cwd = os.getcwd()

# Directory names: where source documents are read from, and the directory
# the conversion loop creates with os.makedirs.
DEFAULT_INPUT_DIR = "inputdocuments"
DEFAULT_OUTPUT_DIR = "finaldocuments"

# Language code. Not referenced by the visible code.
# NOTE(review): presumably meant to be passed to textract as the OCR
# language -- confirm before relying on it.
DEFAULT_LANGUAGE = "deu"
# Shared accumulator filled by getFileName() and consumed by the module-level
# conversion loop. The name shadows the builtin ``list``; it is kept because
# other module-level code refers to it by this name.
list = []


def getFileName(path):
    """Collect the paths of all files under *path*.

    Walks *path* recursively with ``os.walk`` and appends the path of every
    file, except entries named ``.DS_Store``, to the module-level ``list``.

    Args:
        path: Directory to scan. ``os.walk`` yields nothing for a path that
            does not exist, so nothing is appended in that case.

    Returns:
        The module-level ``list`` object itself (not a copy). Entries
        accumulate across calls; the accumulator is never cleared here.
    """
    for subdir, _dirs, files in os.walk(path):
        for name in files:
            # Skip macOS Finder metadata files; they are not documents.
            if name == '.DS_Store':
                continue
            # os.path.join avoids a doubled separator when the directory
            # already ends with one (e.g. a trailing "/" on *path*).
            list.append(os.path.join(subdir, name))
    return list
# Convert every non-.txt file found under the input directory to plain text
# with textract, writing "<source path without extension>.txt".
for source_path in getFileName(os.path.join(cwd, DEFAULT_INPUT_DIR)):
    base_path, file_ext = os.path.splitext(source_path)
    file_name = base_path + '.txt'
    print(file_name)

    # Files that are already .txt are skipped; converting one would
    # overwrite the source with its own extraction.
    if file_ext == '.txt':
        continue

    parsed_text = textract.process(source_path)

    # Create the output directory if it does not exist yet. An OSError is
    # re-raised unless the directory is in fact present.
    doc_path = DEFAULT_OUTPUT_DIR + "/"
    try:
        os.makedirs(doc_path)
    except OSError:
        if not os.path.isdir(doc_path):
            raise

    # NOTE(review): the text file is written next to the source document
    # (file_name), not inside doc_path, which is only created above --
    # confirm whether output was meant to land in DEFAULT_OUTPUT_DIR.
    #
    # textract.process returns bytes, so write in binary mode: a text-mode
    # handle raises TypeError on Python 3. The with-block guarantees the
    # handle is closed even if the write fails.
    with open(file_name, 'wb') as out_file:
        out_file.write(parsed_text)