In [1]:
import glob, os, json
from pathlib import Path
import tqdm
import spacy, pyinflect, nltk
from nltk import pos_tag, sent_tokenize
from nltk.corpus import wordnet as wn
from itertools import chain
from collections import Counter
import time
import re
nlp = spacy.load("en_core_web_trf")

In [51]:
def similar_words(word):
    similar, lemmas = [], []
    for word_set in wn.synsets("need", pos=wn.VERB):
        for related in word_set.lemma_names():
            similar.append(related)
        
        for lemma in word_set.lemmas():
            lemmas.append(lemma)
        
    verbs = ' '.join(set(similar))
    spacy_doc = nlp(verbs)
    verb_types = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
    verbs = [(token._.inflect(verb_type), verb_type) for token in spacy_doc 
                  for verb_type in verb_types if '_' not in str(token)]
    
    nouns = []
    for lemma in lemmas:
        for related_form in lemma.derivationally_related_forms():
            nouns.append(related_form.name().split('.', 1)[0])
    nouns = ' '.join(set(nouns))
    spacy_doc = nlp(nouns)
    noun_types = ["NN", "NNS", "NNP", "NNPS"]
    nouns = [(token._.inflect(noun_type), noun_type) for token in spacy_doc 
                  for noun_type in noun_types]
    nouns = [noun for noun in nouns if noun[0]]
    verbs = [verb[0] for verb in verbs]
    
    return (verbs, nouns)

In [84]:
def remove_num(sentence):
    i = 0
    if sentence.isdigit() == True:
        return 0
    if sentence[0].isdigit() == True:
        while sentence[i].isdigit() == True:
            i += 1
        return i + 1
    return 0

In [91]:
def find_children(parsed_sent, most_similar: tuple) -> tuple:
    children = []
    idx = 0
    VERBS, NOUNS = most_similar
    for idx, word in enumerate(parsed_sent):
        # Find need verb and save its children
        if (word["token"] in VERBS) and ((word["relation"] == "ROOT") or (word["relation"] == "advcl") or word["relation"] == "VERB"):
            children = word["children"]
            break
    return (idx, children)

In [88]:
def who_detection(parsed_sent, children):
    who = ''
    BREAK_LOC, CHILDREN = children
    if len(CHILDREN) == 0:
        return "Length of children is zero"
    # Loop thru tokens occuring before need verb and return the nominal subject
    for idx, token in enumerate(parsed_sent):
        if idx == BREAK_LOC:
            break
        if (token["token"] in CHILDREN) and (token["relation"] == "nsubj"):
            who = token["token"]
        
    return who      

In [89]:
def what_detection(parsed_sent, children):
    what = ''
    idx, CHILDREN = children
    if len(CHILDREN) == 0:
        return "Length of children is zero"
    # Loop thru tokens occuring after need verb and return the direct object
    while idx < len(parsed_sent):
        if (parsed_sent[idx]["token"] in CHILDREN) and (parsed_sent[idx]["relation"] == "dobj"):
            what = parsed_sent[idx]["token"]
        idx += 1
        
    return what  

In [None]:
def who_needs_what(sentence, sim_words):
    doc = nlp(sentence.lower())
    parsed = [{"token": token.text, "pos": token.pos_, "relation": token.dep_, "head": token.head.text,
               "children": [str(child) for child in token.children]} for token in doc]
    children = find_children(parsed, sim_words)
        
    return {"who": who_detection(parsed, children), "what": what_detection(parsed, children)}

In [8]:
priority_needs = []
with open("priority_needs_all.txt", 'r') as f:
    for line in f:
        priority_needs.append(line.strip())
        
priority_needs[33] = priority_needs[33][:3] + 'i' + priority_needs[33][3:]

In [8]:
test_file = all_files[74]

In [9]:
with open(test_file) as f:
    sents = [sent_tokenize(line.replace('\n', '')) for line in f]

In [None]:
all_files = sorted(glob.glob(os.path.join(str(Path.cwd()) + '/gsData2', "*.txt")))
sim_words = similar_words("need")
test_file = all_files[124]
start = time.time()
with open(test_file) as f:
    sents = [sent_tokenize(line.replace('\n', '')) for line in f]

sents = [sent for line in sents for sent in line if line]
sents = [sent[remove_num(sent):] for sent in sents]
sents = list(set(sents))
print("done")
who_what = []
for sent in sents:
    who_what.append(who_needs_what(sent, sim_words))
    
end = time.time()
elapsed = end - start
print(elapsed)

done


In [10]:
who_what

[{'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of children is zero'},
 {'who': 'Length of children is zero', 'what': 'Length of childr

In [11]:
who_list = [sent["who"] for sent in who_what]
what_list = [sent["what"] for sent in who_what]

In [14]:
test = Counter(what_list)

In [15]:
test.most_common()

[('Length of children is zero', 24991),
 ('', 444),
 ('place', 15),
 ('it', 11),
 ('ukraine', 10),
 ('you', 9),
 ('action', 7),
 ('war', 7),
 ('them', 7),
 ('time', 6),
 ('us', 6),
 ('putin', 5),
 ('more', 5),
 ('control', 5),
 ('me', 4),
 ('russia', 4),
 ('train', 4),
 ('look', 4),
 ('weapons', 4),
 ('care', 4),
 ('money', 4),
 ('him', 4),
 ('billion', 4),
 ('help', 4),
 ('support', 3),
 ('people', 3),
 ('ukrainians', 3),
 ('peace', 3),
 ('job', 3),
 ('everything', 3),
 ('question', 3),
 ('revolution', 3),
 ('toll', 3),
 ('5000', 3),
 ('part', 3),
 ('presence', 2),
 ('germany', 2),
 ('steps', 2),
 ('zelensky', 2),
 ('her', 2),
 ('lead', 2),
 ('pretext', 2),
 ('months', 2),
 ('questions', 2),
 ('biden', 2),
 ('country', 2),
 ('land', 2),
 ('austin', 2),
 ('orders', 2),
 ('trump', 2),
 ('role', 2),
 ('vehicles', 2),
 ('stop', 2),
 ('mile', 2),
 ('millions', 2),
 ('sovereignty', 2),
 ('government', 2),
 ('weapon', 2),
 ('note', 2),
 ('guardian', 2),
 ('tibet', 2),
 ('nothing', 2),
 ('her

In [130]:
test_sent = "The Russian army can continue advancing because the Ukrainian military needs additional weapons."
who_what = who_needs_what(test_sent, sim_words)
print(who_what)

{'who': 'military', 'what': 'weapons'}


In [132]:
test_sent = "Polish volunteers needs supplies immediately in Khakiv to hold off Russian reinforcements."
who_what = who_needs_what(test_sent, sim_words)
print(who_what)

{'who': 'volunteers', 'what': 'supplies'}


In [None]:
sim_words = similar_words("need")
for file in tqdm(all_files):
    with open(file) as f:
        for line in f:
            sents = [line.replace('\n', '') for line in f]
            sents = [nlp(line) for line in sents if line]
            sents = [sent[remove_num(sent):] for sent in sents]
            needs_for_day = [frequency_detection(sent, priority_needs, sim_words) for sent in tweets_for_day]
            needs_for_day = sum([x for x in needs_for_day if x], Counter())