-
Notifications
You must be signed in to change notification settings - Fork 0
/
FastNotes.py
138 lines (95 loc) · 3.26 KB
/
FastNotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#Beautiful modules
from bs4.builder import SAXTreeBuilder
import requests
from bs4 import BeautifulSoup
#nltk
import nltk
from nltk import tokenize
#tkinter modules
import tkinter as tk
from tkinter import *
#docx module
import docx
from docx import Document
from docx.enum.style import WD_BUILTIN_STYLE
from docx.enum.text import WD_COLOR_INDEX
#test sample: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2797383/
#function that gets the link that the user inputs and outputs notes of website
def _fetch_matching_sentences(link):
    """Download *link*, collect the text of its <p>/<li> elements, split it
    into sentences with nltk, and return only the sentences that contain at
    least one of the important keywords.

    Shared by get_link() (GUI output) and get_doc() (Word output), which
    previously duplicated this logic line-for-line.
    """
    #specific elements we want
    whitelist = ('p', 'li')
    #specific substrings a sentence must contain to count as a "note"
    important_words = ('%', 'cancer')
    #by using beautiful soup and requests we can access the website
    res = requests.get(link)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    #gets all the text from the p and li elements and formats it into one string
    output = ''
    # string=True is the modern spelling of the deprecated text=True argument
    for t in soup.find_all(string=True):
        if t.parent.name in whitelist:
            output += '{} '.format(t)
    #uses nltk to split the combined text into a list of sentences
    sentences = tokenize.sent_tokenize(output)
    #keep only the sentences that mention an important keyword
    return [s for s in sentences if any(xs in s for xs in important_words)]
#function that gets the link that the user inputs and outputs notes of website
def get_link():
    """Read the URL from the entry widget and display every matching
    sentence as a Label in the tkinter window."""
    matching = _fetch_matching_sentences(entry.get())
    # Build the labels directly instead of assembling source code and running
    # it through exec(): the old approach crashed on any sentence containing
    # quotes/format characters and executed untrusted web text as code.
    # Also fixes the 'wraplengt' option-name typo.
    for sentence in matching:
        row = Label(window, text=sentence, anchor="w", width=720, wraplength=1000)
        row.pack()
def get_doc():
    """Read the URL from the entry widget and save every matching sentence
    into a Word document named Notes.docx."""
    #doc setup
    doc = docx.Document()
    doc.add_heading('NOTES')
    #one paragraph per matching sentence
    for sentence in _fetch_matching_sentences(entry.get()):
        doc.add_paragraph(sentence)
    doc.save('Notes.docx')
#tkinter gui: main window, URL entry, and the two action buttons
window = tk.Tk()
window.title('Fast Notes')
window.geometry('1080x720+60+60')

#StringVar backing the URL entry box
link = tk.StringVar()

label = tk.Label(window, text='Quick Notes!')
label.pack(side=tk.TOP, pady=5)

entry = tk.Entry(window, width=500, textvariable=link, bd=5)
entry.pack(side=tk.TOP, pady=5)

#show the notes inside the window
button = tk.Button(window, text='Generate Notes', command=get_link)
button.pack(side=tk.TOP, pady=5)

#export the notes to Notes.docx
doc_button = tk.Button(window, text='Generate Doc', command=get_doc)
doc_button.pack(side=tk.TOP, pady=5)

window.mainloop()