-
Notifications
You must be signed in to change notification settings - Fork 0
/
highlighter.py
139 lines (111 loc) · 4.42 KB
/
highlighter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import io
import os
import time
from unitex_tagger import TextTagger
from text_extractor import PDFExtractor
from werkzeug.utils import secure_filename
from gunicorn.app.base import BaseApplication
from flask import Flask, render_template, request, redirect, flash, Markup, jsonify
# Directory (relative to the process working directory) where uploaded PDFs
# are saved; it is copied into app.config['UPLOAD_FOLDER'] further down.
UPLOAD_FOLDER = 'static/uploads/'
# Lower-case file extensions that allowed_file() accepts for uploads.
ALLOWED_EXTENSIONS = {'pdf'}
# Module-level placeholder. index() assigns its own local ``pdf_file`` without
# a ``global`` statement, so no code visible in this file reads this name.
pdf_file = None
def clean_directory(working_file, folder='static/uploads/'):
    """Delete a single file, identified by name, from an upload folder.

    The name is only honoured when it matches an entry returned by
    ``os.listdir(folder)``. This keeps values such as ``'../secret'`` from
    escaping the folder, because they never appear in the listing.

    Args:
        working_file: Bare file name to delete. ``None`` or an empty string
            is ignored.
        folder: Directory to clean. Defaults to the upload directory used by
            the web application.

    Returns:
        None. Failures are reported on stdout rather than raised, because
        cleanup is a best-effort operation.
    """
    if not working_file:
        return

    try:
        entries = os.listdir(folder)
    except OSError as e:
        # A missing or unreadable folder means there is nothing to clean.
        print('Failed to list %s. Reason: %s' % (folder, e))
        return

    if working_file not in entries:
        return

    file_path = os.path.join(folder, working_file)
    try:
        os.unlink(file_path)
    except OSError as e:
        print('Failed to delete %s. Reason: %s' % (file_path, e))
class GunicornApplication(BaseApplication):
    """Gunicorn application wrapper that serves an in-process WSGI app.

    Wraps an already constructed WSGI application together with an optional
    mapping of gunicorn settings, so the server can be started from Python
    code by calling ``run()``.
    """

    def __init__(self, application, config=None):
        """Store the WSGI app and its settings, then initialise the base.

        Args:
            application: The WSGI callable returned by ``load()``.
            config: Optional mapping of setting names to values; ``None`` or
                an empty mapping means no overrides.
        """
        # Both attributes are assigned before the base initialiser runs, as in
        # the original ordering -- presumably because BaseApplication.__init__
        # triggers load_config(); confirm against the gunicorn docs.
        self.__application = application
        self.__config = config if config else {}
        super(GunicornApplication, self).__init__()

    def load_config(self):
        """Copy every recognised, non-None option into ``self.cfg``."""
        known_settings = self.cfg.settings
        for option, setting in self.__config.items():
            if option not in known_settings or setting is None:
                continue
            self.cfg.set(option.lower(), setting)

    def load(self):
        """Return the wrapped WSGI application."""
        return self.__application
# The Flask application object that every route below is registered on.
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# index() calls flash(), which stores messages in the session; Flask raises a
# RuntimeError when the session is used without a secret key. Prefer a key
# supplied through the SECRET_KEY environment variable and fall back to a
# random per-process key so flashing works out of the box.
# NOTE: with the random fallback, sessions do not survive a restart and are
# not shared between several worker processes -- set SECRET_KEY in production.
app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(24)
def allowed_file(filename, allowed_extensions=None):
    """Tell whether ``filename`` carries an accepted extension.

    The comparison is case-insensitive, so ``REPORT.PDF`` is treated the same
    as ``report.pdf``.

    Args:
        filename: Name supplied by the client. ``None`` or an empty string
            is rejected.
        allowed_extensions: Optional collection of lower-case extensions
            (without the dot). Defaults to the module-level
            ``ALLOWED_EXTENSIONS``.

    Returns:
        True when the text after the last dot is an accepted extension,
        False otherwise.
    """
    if allowed_extensions is None:
        allowed_extensions = ALLOWED_EXTENSIONS
    if not filename or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in allowed_extensions
def convert_pdf(file_path, remove_images=False):
    """
    Runs the PDF-to-HTML step of PDFExtractor on the given file.

    ``remove_images`` is forwarded unchanged to the extractor. Nothing is
    returned; see the PDFExtractor class for where the result is written
    (presumably the 'full_pdf.html' file read by tag_text() -- confirm).
    """
    PDFExtractor(file_path)._pdf_to_html(remove_images)
def tag_text():
    """
    Tags the converted document with Unitex grammars.

    Loads 'full_pdf.html' into a TextTagger and applies the composed
    highlight graph, which bundles the text-matching grammars with the tags
    assigned to them. See the TextTagger class for more information.

    Returns a 2-tuple: the tagger's ``matched`` attribute and the tagged
    HTML wrapped in ``Markup`` for use in the HTML template.
    """
    unitex = TextTagger('config/unitex-example.yaml')
    unitex.import_text('full_pdf.html')
    tagged_html = unitex.tag_text('config/graphs/highlight.fst2')

    # Read the matches only after tagging, then wrap the HTML.
    found = unitex.matched
    # NOTE(review): Markup() marks the string as safe for the template; the
    # HTML is derived from an uploaded PDF, so confirm it cannot carry
    # attacker-controlled markup.
    safe_html = Markup(tagged_html)
    return found, safe_html
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the main page and, on POST, process an uploaded PDF.

    GET renders 'index.html' with empty placeholder values. POST expects a
    'file' part: the upload is checked with allowed_file(), saved under the
    configured upload folder using its secure_filename() name, converted with
    convert_pdf() and tagged with tag_text() before the page is rendered.
    A missing 'file' part or a rejected file redirects back to the same URL.
    """
    # Placeholder values used when there is nothing to show yet.
    pdf_file = file_name = pdf = None
    text_content = [""]
    matches = {'sent': {'': ''}, 'entity': {'': ''}, 'idx': {'': ''}}

    if request.method == 'POST':
        # Guard: the form must carry a 'file' part.
        if 'file' not in request.files:
            flash("No file part")
            return redirect(request.url)

        upload = request.files['file']
        # Raw client-side name ('name.pdf'); this is what the template gets.
        # NOTE(review): the file on disk uses the secure_filename() variant,
        # which can differ from this raw name -- confirm the template does
        # not rely on both being equal.
        file_name = upload.filename

        # Guard: only PDF uploads are processed.
        if not allowed_file(file_name):
            print(file_name)
            print('File is not a PDF')
            return redirect(request.url)

        safe_name = secure_filename(file_name)

        if file_name != '':
            # Store the upload, then convert and tag it.
            pdf_file = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
            upload.save(pdf_file)

            # The 'rmv_img' form field switches image removal on.
            remove_images = bool(request.form.get('rmv_img'))
            if remove_images:
                print("removing IMG")

            convert_pdf(pdf_file, remove_images)
            matches, pdf = tag_text()

    # The cleanup function itself is handed to the template as ``clean``.
    return render_template(
        'index.html',
        text=text_content,
        matches=matches,
        file_path=pdf_file,
        file_name=file_name,
        clean=clean_directory,
        pdf=pdf,
    )
def shutdown_server():
    """Invoke the shutdown hook found in the current request's WSGI environ.

    Looks up the 'werkzeug.server.shutdown' entry and calls it.

    Raises:
        RuntimeError: if the environ carries no such entry.
    """
    stop_server = request.environ.get('werkzeug.server.shutdown')
    if stop_server is not None:
        stop_server()
        return
    raise RuntimeError('Not running with the Werkzeug Server')
@app.route('/shutdown', methods=['POST'])
def shutdown():
    """Handle POST /shutdown: stop the server and acknowledge in plain text."""
    # NOTE(review): no authentication is visible on this route -- confirm it
    # is not reachable by untrusted clients.
    shutdown_server()
    acknowledgement = 'Server shutting down...'
    return acknowledgement
@app.route('/clean', methods=['GET', 'POST'])
def clean_pdf():
    """Delete the upload named by the ``param_file`` query argument.

    Waits five seconds, then hands the name to clean_directory() and returns
    that call's return value as JSON under the ``result`` key.
    """
    target_name = request.args.get('param_file')
    # Fixed delay before the file is removed -- presumably to give the client
    # time to finish loading the PDF; confirm.
    time.sleep(5)
    outcome = clean_directory(target_name)
    return jsonify(result=outcome)
if __name__ == '__main__':
    # Serve the Flask app through the embedded gunicorn wrapper when this
    # file is run as a script.
    server = GunicornApplication(app)
    server.run()
    # app.run(debug=False)