In [2]:
import nltk
import spacy
import pandas as pd
import networkx as nx

##  Token level features
- The full constituent starting from a head word
- The head of each token
- The dependent(s) of each token
- The Syntactic dependency relation from each token
- Part-of-speech tag of each token

In [3]:
data_path = "data/article.txt"

with open(data_path, encoding="utf-8") as f:
    content = f.read()

nlp = spacy.load("en_core_web_sm")
doc = nlp(content)

token = [tok for tok in doc]
constituent = [[t.text for t in tok.subtree] for tok in doc]  # The full constituent starting from a head word
head = [tok.head for tok in doc]  # The head of each token
dependent = [[t.text for t in tok.children] for tok in doc]  # The dependent(s) of each token
dependency = [tok.dep_ for tok in doc]  # The Syntactic dependency relation from each token
pos = [tok.pos_ for tok in doc]  # Part-of-speech tag of each token

result = pd.DataFrame({"token": token, "constituent": constituent, "head": head, "dependent": dependent, "dependency": dependency, "POS": pos})

In [4]:
result[:3]  # Example of the first 3 lines of the feature set

Unnamed: 0,token,constituent,head,dependent,dependency,POS
0,Universities,"[Universities, in, the, Netherlands]",looking,[in],nsubj,NOUN
1,in,"[in, the, Netherlands]",Universities,[Netherlands],prep,ADP
2,the,[the],Netherlands,[],det,DET


In [5]:
result.to_csv('data/token-level-features.conll', sep='\t', index=False)  # Save as the corresponding conll file

## Sentence level features
- Shortest Dependency Path of each sentence
- The length of Shortest Dependency Path of each sentence

In [6]:
sentence_list = nltk.tokenize.sent_tokenize(content)
path_list = []
length_list = []

for sentence in sentence_list:
    edges = []
    doc = nlp(sentence)
    subject = [tok for tok in doc if (tok.dep_ == "nsubj")]
    direct_object = [tok for tok in doc if (tok.dep_ == "dobj")]
    if (len(direct_object) < 1) or (len(subject) < 1):
        path_list.append(None)
        length_list.append(None)
        continue
    else:
        subject = subject[0]
        direct_object = direct_object[0]
        for token in doc:
            for child in token.children:
                edges.append(('{0}'.format(token.lower_),
                                  '{0}'.format(child.lower_)))
        graph = nx.Graph(edges)
        entity1 = str(subject).lower()
        entity2 = str(direct_object).lower()
        shortest_path = nx.shortest_path(graph, source=entity1, target=entity2)
        length = nx.shortest_path_length(graph, source=entity1, target=entity2)
        path_list.append(shortest_path)
        length_list.append(length)

In [7]:
results = pd.DataFrame({"sentence": sentence_list, "shortest_dep_path": path_list, "shortest_path_length": length_list})

In [8]:
pd.set_option('display.max_colwidth', None)
results[:3]  # Example of the first 3 lines of the feature set

Unnamed: 0,sentence,shortest_dep_path,shortest_path_length
0,"Universities in the Netherlands are looking for ways to demonstrate when students are guilty of using ChatGPT, software that uses artificial intelligence to write pieces of text.","[universities, looking, are, guilty, of, using, software]",6.0
1,"Institutions said that text written by the program can be classified as academic fraud, but it is still difficult to prove, representatives of several universities told ANP.","[institutions, said, told, anp]",3.0
2,"If students have not written something themselves, they are not allowed to present it as self-made work, explained the University of Twente.","[students, written, something]",2.0


In [9]:
results.to_csv('data/sentence-level-features.conll', sep='\t', index=False)  # Save as the corresponding conll file