In [6]:
import pandas as pd
from cso_classifier import CSOClassifier
import nltk
import spacy
# Fetch the NLTK "stopwords" corpus (a no-op when it is already up to date).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/d-bts/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load the small English spaCy pipeline. The returned object is not stored,
# so this cell mainly confirms that the model is installed and loadable.
spacy.load("en_core_web_sm")

<spacy.lang.en.English at 0x7f1a68c5a790>

In [12]:
# Load the pipe-separated anthology table. With na_filter=False, empty fields
# are kept as empty strings instead of being converted to NaN.
df = pd.read_csv(
    "data/anthology_conferences.csv",
    sep="|",
    na_filter=False,
)
df.head(2)

Unnamed: 0,url,publisher,address,year,month,editor,title,ENTRYTYPE,ID,pages,...,note,pdf,abstract,semantic_scholar,semantic_scholar_authorIds,semantic_scholar_keywords,cso_syntactic,cso_semantic,cso_union,cso_enhanced
0,https://www.aclweb.org/anthology/2020.acl-main.1,Association for Computational Linguistics,Online,2020,July,,Learning to Understand Child-directed and Adul...,inproceedings,gelderloos-etal-2020-learning,1--6,...,,2020.acl-main.1.pdf,Speech directed to children differs from adult...,2020.acl-main.1.json,"['7805500', '2756960', '103538973']","['1017215', '1588157']","['linguistics', 'acoustics', 'language acquisi...","['speech signals', 'synthetic speech', 'lingui...","['linguistics', 'automatic speech recognition'...","['speech recognition', 'signal processing', 'e..."
1,https://www.aclweb.org/anthology/2020.acl-main.2,Association for Computational Linguistics,Online,2020,July,,Predicting Depression in Screening Interviews ...,inproceedings,rinaldi-etal-2020-predicting,7--18,...,,2020.acl-main.2.pdf,Despite the pervasiveness of clinical depressi...,2020.acl-main.2.json,"['19320780', '2457504', '37202877']",['8505'],"['linguistics', 'pattern languages', 'psycholi...","['latent variable', 'latent factor', 'linguist...","['latent factor', 'linguistics', 'dialogue', '...","['matrix factorizations', 'argumentation', 'sp..."


In [13]:
# Example abstract: the "abstract" field of the row labelled 0
abstract = df.at[0, "abstract"]
abstract

'Speech directed to children differs from adultdirected speech in linguistic aspects such as repetition, word choice, and sentence length, as well as in aspects of the speech signal itself, such as prosodic and phonemic variation. Human language acquisition research indicates that child-directed speech helps language learners. This study explores the effect of child-directed speech when learning to extract semantic information from speech directly. We compare the task performance of models trained on adult-directed speech (ADS) and child-directed speech (CDS). We find indications that CDS helps in the initial stages of learning, but eventually, models trained on ADS reach comparable task performance, and generalize better. The results suggest that this is at least partially due to linguistic rather than acoustic properties of the two registers, as we see the same pattern when looking at models trained on acoustically comparable synthetic speech.'

In [14]:
# Example extracted topics: classify the single example abstract.
# The printed result is a dict with the keys 'syntactic', 'semantic',
# 'union', 'enhanced' and 'explanation'.
cc = CSOClassifier(
    modules="both",
    enhancement="first",
    explanation=True,
)
result = cc.run(abstract)
print(result)

Computer Science Ontology loaded.
Model loaded.
{'syntactic': ['speech signals', 'linguistics', 'semantic information', 'language acquisition', 'synthetic speech', 'acoustics'], 'semantic': ['speech signals', 'linguistics', 'semantic information', 'semantics', 'language acquisition', 'synthetic speech', 'acoustics'], 'union': ['semantic information', 'semantics', 'language acquisition', 'synthetic speech', 'speech signals', 'acoustics', 'linguistics'], 'enhanced': ['speech communication', 'signal processing'], 'explanation': {'linguistics': ['language acquisition', 'linguistic'], 'speech signals': ['speech', 'speech signal'], 'language acquisition': ['language acquisition'], 'semantic information': ['semantic', 'semantic information'], 'acoustics': ['acoustic', 'acoustic properties'], 'synthetic speech': ['speech', 'synthetic speech'], 'semantics': ['semantic information', 'semantic'], 'speech communication': ['speech', 'speech signal', 'synthetic speech'], 'signal processing': ['acous

In [15]:
# Create the batch-mode input: one entry per dataframe row, keyed by the
# stringified row index, holding the title and abstract (keywords left empty).
papers = {
    str(i): {"title": row["title"], "abstract": row["abstract"], "keywords": ""}
    for i, row in df.iterrows()
}
papers

{'0': {'title': 'Learning to Understand Child-directed and Adult-directed Speech',
  'abstract': 'Speech directed to children differs from adultdirected speech in linguistic aspects such as repetition, word choice, and sentence length, as well as in aspects of the speech signal itself, such as prosodic and phonemic variation. Human language acquisition research indicates that child-directed speech helps language learners. This study explores the effect of child-directed speech when learning to extract semantic information from speech directly. We compare the task performance of models trained on adult-directed speech (ADS) and child-directed speech (CDS). We find indications that CDS helps in the initial stages of learning, but eventually, models trained on ADS reach comparable task performance, and generalize better. The results suggest that this is at least partially due to linguistic rather than acoustic properties of the two registers, as we see the same pattern when looking at mod

In [16]:
# Run the CSO classifier over the whole paper batch using 16 workers.
cc = CSOClassifier(
    workers=16,
    modules="both",
    enhancement="first",
    explanation=True,
)
result = cc.batch_run(papers)
print(result)

Computer Science Ontology loaded.
Model loaded.
Processing: 0
Processing: 1
Processing: 2
Processing: 3
Processing: 4
Processing: 5
Processing: 6
Processing: 7
Processing: 8
Processing: 9
Processing: 10
Processing: 11
Processing: 12
Processing: 13
Processing: 14
Processing: 15
Processing: 16
Processing: 17
Processing: 18
Processing: 19
Processing: 20
Processing: 21
Processing: 22
Processing: 23
Processing: 24
Processing: 25
Processing: 26
Processing: 27
Processing: 28
Processing: 29
Processing: 30
Processing: 31
Processing: 32
Processing: 33
Processing: 34
Processing: 35
Processing: 36
Processing: 37
Processing: 38
Processing: 39
Processing: 40
Processing: 41
Processing: 42
Processing: 43
Processing: 44
Processing: 45
Processing: 46
Processing: 47
Processing: 48
Processing: 49
Processing: 50
Processing: 51
Processing: 52
Processing: 53
Processing: 54
Processing: 55
Processing: 56
Processing: 57
Processing: 58
Processing: 59
Processing: 60
Processing: 61
Processing: 62
Processing: 63
Pr

KeyboardInterrupt: 

In [22]:
# Fill the dataframe with the CSO topics from the batch result.
# Each classifier output key X is stored in the column "cso_X".
cso_modules = ("syntactic", "semantic", "union", "enhanced")

# Reset all four columns to empty strings first, so rows that are missing
# from `result` end up with "".
for module in cso_modules:
    df["cso_" + module] = ""

# The keys of `result` are the stringified row indices of the dataframe.
for paper_key, paper_topics in result.items():
    row_label = int(paper_key)
    for module in cso_modules:
        df.at[row_label, "cso_" + module] = paper_topics[module]

In [23]:
# Preview the first four rows of the dataframe.
df.head(4)

Unnamed: 0,url,publisher,address,year,month,editor,title,ENTRYTYPE,ID,pages,...,note,pdf,abstract,semantic_scholar,semantic_scholar_authorIds,semantic_scholar_keywords,cso_syntactic,cso_semantic,cso_union,cso_enhanced
0,https://www.aclweb.org/anthology/2020.acl-main.1,Association for Computational Linguistics,Online,2020,July,,Learning to Understand Child-directed and Adul...,inproceedings,gelderloos-etal-2020-learning,1--6,...,,2020.acl-main.1.pdf,Speech directed to children differs from adult...,2020.acl-main.1.json,"['7805500', '2756960', '103538973']","['1017215', '1588157']","[linguistics, acoustics, language acquisition,...","[speech signals, synthetic speech, linguistics...","[linguistics, automatic speech recognition, ac...","[speech recognition, signal processing, educat..."
1,https://www.aclweb.org/anthology/2020.acl-main.2,Association for Computational Linguistics,Online,2020,July,,Predicting Depression in Screening Interviews ...,inproceedings,rinaldi-etal-2020-predicting,7--18,...,,2020.acl-main.2.pdf,Despite the pervasiveness of clinical depressi...,2020.acl-main.2.json,"['19320780', '2457504', '37202877']",['8505'],"[linguistics, pattern languages, psycholinguis...","[latent variable, latent factor, linguistics, ...","[latent factor, linguistics, dialogue, pattern...","[matrix factorizations, argumentation, speech ..."
2,https://www.aclweb.org/anthology/2020.acl-main.3,Association for Computational Linguistics,Online,2020,July,,Coach: A Coarse-to-Fine Approach for Cross-dom...,inproceedings,liu-etal-2020-coach,19--25,...,,2020.acl-main.3.pdf,As an essential task in task-oriented dialog s...,2020.acl-main.3.json,"['152613855', '9162688', '145011005', '40539650']","['6664', '27148', '83260', '48813', '1117']","[regularization, named entity recognition]","[time slots, regularization, spoken language u...","[time slots, regularization, integrated data, ...","[wireless networks, wireless sensor networks, ..."
3,https://www.aclweb.org/anthology/2020.acl-main.4,Association for Computational Linguistics,Online,2020,July,,Designing Precise and Robust Dialogue Response...,inproceedings,zhao-etal-2020-designing,26--33,...,,2020.acl-main.4.pdf,Automatic dialogue response evaluator has been...,2020.acl-main.4.json,"['46887780', '1939089', '1717105']","['2760', '879', '26812', '8878', '14829', '488...","[human evaluation, dialogue, language model]","[dialogue, human evaluation, language model, s...","[dialogue, spoken dialogue, dialogue managemen...","[argumentation, speech processing, linguistics..."


In [24]:
# Store the anthology dataframe as pipe-separated CSV without the index.
# NOTE: this writes to the same path the dataframe was read from, so the
# input file is overwritten.
df.to_csv(
    "data/anthology_conferences.csv",
    sep="|",
    index=False,
)

In [3]:
# Create the CSO "nlp" sub-topic hierarchy as a nested dict
# ({topic: {child: {grandchild: ...}}}); topics without children map to {}.
#
# `topic_filter` is the text of a superTopicOf row up to the child id. The
# single "_" placeholder is replaced by the parent topic id, and whatever
# follows the filter text in a matching row is the child topic id.
topic_filter = '"<https://cso.kmi.open.ac.uk/topics/_>","<http://cso.kmi.open.ac.uk/schema/cso#superTopicOf>","<https://cso.kmi.open.ac.uk/topics/'

# Read the ontology file once instead of re-reading it for every expanded
# topic. Only rows containing the superTopicOf predicate can match the filter,
# so the rest are dropped up front.
with open("data/CSO.3.2.csv") as f:
    super_topic_lines = [line for line in f if "cso#superTopicOf" in line]

topic_hierarchie = dict()
topics = ["nlp"]  # FIFO queue of "/"-joined paths from the root topic

while topics:
    topic_path = topics.pop(0)
    topic_path_split = topic_path.split("/")
    topic = topic_path_split[-1]
    full_filter = topic_filter.replace("_", topic)

    # Walk down the already-built tree to the parent node of `topic`.
    dict_ = topic_hierarchie
    for t in topic_path_split[:-1]:
        dict_ = dict_[t]
    dict_[topic] = {}

    for line in super_topic_lines:
        if full_filter not in line:
            continue
        # Strip the filter prefix and the trailing `>"` + newline, which
        # leaves just the child topic id.
        x = line.replace(full_filter, "").replace('>"\n', "")
        dict_[topic][x] = dict()
        # Do not expand a child that is already on its own ancestor path;
        # a cyclic superTopicOf chain would otherwise never terminate.
        if x not in topic_path_split:
            topics.append(topic_path + "/" + x)

topic_hierarchie

{'nlp': {'abstracting_and_indexing': {'subject_headings': {}},
  'parse_trees': {},
  'part_of_speech': {'pos_tagging': {'pos_taggers': {}}},
  'part_of_speech_tagging': {'pos_tagging': {'pos_taggers': {}}},
  'part-of-speech_tagging': {'pos_tagging': {'pos_taggers': {}}},
  'natural_language_text': {},
  'natural_language_understanding': {},
  'lexical_resources': {'wordnet': {}},
  'topic_model': {'hierarchical_dirichlet_process': {}},
  'text_processing': {'word_processing': {'electronic_document': {'style_sheets': {}},
    'electronic_documents': {'style_sheets': {}}},
   'text_mining': {'text_mining_techniques': {},
    'text_document': {},
    'textual_data': {},
    'text_data': {},
    'text_representation': {}},
   'text_summarization': {'automatic_text_summarization': {},
    'automatic_summarization': {},
    'sentence_extraction': {}},
   'term_frequency': {'inverse_document_frequency': {},
    'document_frequency': {}},
   'document_classification': {},
   'textual_entailm

In [9]:
import json

# Serialise the nested topic hierarchy and write it to a JSON file.
with open("data/cso_nlp_hierarchie.json", "w") as jf:
    jf.write(json.dumps(topic_hierarchie))