# -*- coding: utf-8 -*-
"""
Created on Sat Apr 6 14:20:15 2019
@author: aashaar
"""
import glob
import errno
import nltk
import spacy
from spacy import displacy
#nltk.download()  # needs the 'punkt', 'wordnet' and 'averaged_perceptron_tagger' data packages used below
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import copy
from itertools import chain
#from nltk.stem import PorterStemmer
path = 'E:/UTD/4th Sem/Natural Language Processing CS 6320/Project/WikipediaArticles/*.txt' #note C:
#Tokenization:
docs = []         # one document string per file
sent_tokens = []  # sentence tokens from all documents, flattened into one list
word_tokens = []  # word tokens from all documents, flattened into one list
files = glob.glob(path)
for name in files:
    try:
        with open(name, encoding="utf-8-sig") as f:
            file = f.read()   # read once; a second f.read() here would return an empty string
        print(file)
        docs.append(file)
        word_tokens.extend(word_tokenize(file))
        sent_tokens.extend(sent_tokenize(file))
    except IOError as exc:
        if exc.errno != errno.EISDIR:  # ignore directories picked up by the glob
            raise
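# Quick sanity check: how much text was actually collected (sizes will vary
# with the corpus).
print("documents:", len(docs))
print("sentences:", len(sent_tokens))
print("word tokens:", len(word_tokens))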
#Lemmatization:
lemmatize_word = []
lemmatizer = WordNetLemmatizer()
for word in word_tokens:
    lemmatize_word.append(lemmatizer.lemmatize(word))
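# Optional refinement (a sketch): WordNetLemmatizer treats every token as a
# noun by default, so verbs such as "was" or "founded" come back unchanged.
# Mapping Penn Treebank tags to WordNet POS tags gives better lemmas; the
# helper name penn_to_wordnet is my own, not part of NLTK.
def penn_to_wordnet(tag):
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('R'):
        return wn.ADV
    return wn.NOUN
#lemmatize_word = [lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in nltk.pos_tag(word_tokens)]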
# Parts of Speech Tagging:
POS_tags = nltk.pos_tag(word_tokens)
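# Example use of the tags (a small sketch): keep only the noun tokens, e.g. as
# candidate words for the WordNet lookups further below.
nouns = [w for w, t in POS_tags if t.startswith('NN')]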
#Dependency Graphs with Spacey:
en_nlp = spacy.load('en_core_web_sm')
temp_sent = ['My name is Aashaar',
             'he was born in 1994',
             'Apple was founded in 1998',
             'Microsoft was founded in 1800',
             'Who founded Apple?',
             'When did Abraham Lincoln die?',
             'When was the Gettysburg address by Abraham Lincoln']
dependency_parse_dict = dict()
#for sentence in sent_tokens:
for sentence in temp_sent:
    doc = en_nlp(sentence)
    sent = list(doc.sents)
    for s in sent:
        #print(s.root)  # root token of the sentence's dependency tree
        if s.root.text in dependency_parse_dict:
            dependency_parse_dict[s.root.text].append(s)
        else:
            dependency_parse_dict[s.root.text] = [s]
#dependency_parse_dict
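# Visualising one parse (a minimal sketch): displacy is imported above but not
# otherwise used; render() returns the dependency arcs as HTML markup, which
# can be written to a file and opened in a browser.
dep_html = displacy.render(en_nlp(temp_sent[0]), style='dep')
#print(dep_html)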
#Extract Synonyms, Hypernyms, Hyponyms, Holonyms & Meronyms:
lists = ['Abraham', 'America', 'slavery', 'founded','Apple','lands']
temp_syn=[]
synonymns_dict = dict()
temp_hyper = []
hypernyms_dict = dict()
temp_hypo = []
hyponyms_dict = dict()
temp_mero = []
meronyms_dict = dict()
temp_holo = []
holonyms_dict = dict()
#for word in word_tokens:
for word in lists:
    for j in wn.synsets(word):
        temp_syn.extend(j.lemma_names())
        temp_hyper.extend(list(chain(*[l.lemma_names() for l in j.hypernyms()])))
        temp_hypo.extend(list(chain(*[l.lemma_names() for l in j.hyponyms()])))
        temp_mero.extend(list(chain(*[l.lemma_names() for l in j.part_meronyms()])))
        temp_holo.extend(list(chain(*[l.lemma_names() for l in j.part_holonyms()])))
    # one entry per word: copy the accumulated names, then reset the buffers
    synonymns_dict[word] = copy.deepcopy(temp_syn)
    hypernyms_dict[word] = copy.deepcopy(temp_hyper)
    hyponyms_dict[word] = copy.deepcopy(temp_hypo)
    meronyms_dict[word] = copy.deepcopy(temp_mero)
    holonyms_dict[word] = copy.deepcopy(temp_holo)
    temp_syn.clear()
    temp_hyper.clear()
    temp_hypo.clear()
    temp_mero.clear()
    temp_holo.clear()
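# Quick look at the extracted relations for a couple of the sample words
# (a usage example; output depends on the installed WordNet data):
print(synonymns_dict['Apple'][:5])
print(hypernyms_dict['founded'][:5])
print(hyponyms_dict['lands'][:5])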
#Named Entity Recognition:
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(X.text, X.label_) for X in doc.ents])
doc = nlp('When was the Gettysburg address by Abraham Lincoln?')
print([(X.text, X.label_) for X in doc.ents])
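# The label codes can be expanded into readable descriptions with
# spacy.explain, e.g. 'PERSON' -> 'People, including fictional':
print([(X.text, X.label_, spacy.explain(X.label_)) for X in doc.ents])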