In [None]:
pip install git+https://github.com/filipinascimento/WOS.git -U

In [None]:
pip install dbgz -U

In [None]:
pip install git+https://github.com/filipinascimento/WOSRaw.git

In [29]:
import WOSRaw as wos
import xnetwork as xnet

In [1]:
import json
import glob
import pandas as pd
from nltk import ngrams

In [2]:
files = glob.glob("data/sens_papers*.json")

In [36]:
first_term = [('machine', 'learning'),
 ('deep', 'learning'),
 ('artificial', 'intelligence'),
 ('large', 'language', 'model'),
 ('natural', 'language', 'processing'),
 ('computational', 'vision'),
 ('computer', 'vision')]

second_term_startwith = ['sensor', 'biosensor', 'immounosensor', 
               'genosensor', 'immunoassay', 'genoassay', 'assay', 
               'e-tongue', 'etongue', 'e-nose', 'enose']
second_term = [('sensing'),
('electronic', 'tongue'),               
('electronic', 'nose')]

third_term = ['diagnos', 'detect', 'predict', 'screen', 'measur', 'monitor']


def is_valid_start_or(terms, title):
    words = title.split()
    for word in words:
        for term in terms:
            if word.startswith(term):
                return True
    return False

def is_valid_n_gram(terms, title):
    terms1 = set(terms)
    title_ngrams = list(ngrams(title, 1)) + list(ngrams(title, 2)) + list(ngrams(title, 3))    
    title_ngrams = set(title_ngrams)
    return len(title_ngrams & terms1) > 0

def valid_complete(title):
    title1 = title.lower()
    valid_first = is_valid_n_gram(first_term, title1.split())
    # print(valid_first, 'first')
    valid_second = is_valid_n_gram(second_term, title1) or is_valid_start_or(second_term_startwith, title1)
    # print(valid_second, 'second')
    valid_third = is_valid_start_or(third_term, title1)
    # print(valid_third, 'third')
    
    return valid_first and valid_second and valid_third

# def is_valid_term(title):
#     keyterms1 = {('ml',), ('ai',)}
#     keyterms2 = {('machine', 'learning'), ('artificial', 'intelligence')}
#     keyterm3 = 'sens'
    
#     ngrams1 = set(ngrams(title, 1))
#     ngrams2 = set(ngrams(title, 2))

#     valid = False
#     for ngram in ngrams1:
#         if ngram[0].startswith(keyterm3):
#             valid = True

#     if not valid:
#         return False
#     if len(keyterms1 & ngrams1) > 0:
#         return True
#     if len(keyterms2 & ngrams2) > 0:
#         return True
        
#     return False



In [None]:
titles = []
abstracts = []
count = 0
valid_papers = []
for file in files:
    data = pd.read_json(file)
    for idx, entry in data.iterrows():
        title = wos.utilities.getTitle(entry)
        is_valid1 = valid_complete(title)
        # print(ngrams2)
        abst = ' '.join(wos.utilities.getAbstract(entry)).lower()
        is_valid2 = False
        
        if abst != '':
            is_valid2 = valid_complete(abst)
    
        if is_valid1 or is_valid2:
            count += 1
            refs = wos.utilities.getReferences(entry)
            uids_refs = wos.utilities.getReferencesUIDs(refs)
            valid_papers.append((entry['UID'], uids_refs))
            titles.append(title)
            abstracts.append(abst)
            # break
            
    # break

In [59]:
titles[:10]

['AI-Augmented Behavior Analysis for Children With Developmental Disabilities: Building Toward Precision Treatment',
 'Hand tremor detection in videos with cluttered background using neural network based approaches',
 'Predictive asset availability optimization for underground trucks and loaders in the mining industry',
 'Integrating BIM into sensor-based facilities management operations',
 'Detecting and locating cyber and physical stresses in smart grids using the k-nearest neighbour analysis of instantaneous correlation of states',
 'Emphasizing privacy and security of edge intelligence with machine learning for healthcare',
 'DeepNeurite (TM): Identification of neurites from non-specific binding of fluorescence probes through deep learning',
 'Visual image and radio signal fusion identification based on convolutional neural networks',
 'Knowledge extraction for automatic driving control by using local correlation features',
 'ECG diagnostic support system (EDSS): A deep learning ne

In [40]:
len(valid_papers)

11341

In [61]:
len(titles)

45132

In [41]:
output = open('data/sens_ml_ai_papers_03_02.json', 'w')
output.write(json.dumps(valid_papers))
output.close()

In [43]:
papers = pd.read_json('data/sens_ml_ai_papers_03_02.json')
print(len(papers))
papers.head()

11341


Unnamed: 0,0,1
0,WOS:000711703000007,"[WOS:000454996700009, , WOS:000549646700010, I..."
1,WOS:000672437400001,"[WOS:000280543604256, WOS:000291142600016, WOS..."
2,WOS:000612302800001,"[WOS:000377043400002, , INSPEC:14339154, , WOS..."
3,WOS:000609377000001,"[, , , WOS:000214300300002, WOS:00021003240001..."
4,WOS:000632562300001,"[WOS:000442348400005, , , WOS:000347558200001,..."


In [44]:
edges = []
valid_papers = set(papers[[0]].values.flatten())
print(len(valid_papers))

11339


In [45]:
for idx, paper in papers.iterrows():
    for p in paper[1]:
        if p in valid_papers:
            pair = (paper[0], p)
            edges.append(pair)

In [46]:
len(edges)

6506

In [47]:
valid_papers = list(valid_papers)

In [48]:
from igraph import Graph

In [49]:
net = Graph()
net.add_vertices(len(valid_papers))
net.vs['name'] = valid_papers
net.add_edges(edges)

In [50]:
net.vcount()

11339

In [51]:
net.ecount()

6506

In [52]:
title_map = dict(zip(papers[[0]].values.flatten(), titles))

In [61]:
titles[:5]

['AI-Augmented Behavior Analysis for Children With Developmental Disabilities: Building Toward Precision Treatment',
 'Hand tremor detection in videos with cluttered background using neural network based approaches',
 'Predictive asset availability optimization for underground trucks and loaders in the mining industry',
 'Integrating BIM into sensor-based facilities management operations',
 'Detecting and locating cyber and physical stresses in smart grids using the k-nearest neighbour analysis of instantaneous correlation of states']

In [53]:
abst_map = dict(zip(papers[[0]].values.flatten(), abstracts))

In [54]:
len(title_map)

11339

In [62]:
net.vs['title'] = [title_map[uid] for uid in net.vs['name']]

In [63]:
net.vs['title'][:10]

['A Passive Learning Sensor Architecture for Multimodal Image Labeling: An Application for Social Robots',
 'Data-Driven Condition Monitoring of Mining Mobile Machinery in Non-Stationary Operations Using Wireless Accelerometer Sensor Modules',
 'Deep Learning Based Prediction Towards Designing A Smart Building Assistant System',
 'Deep Person Detection in Two-Dimensional Range Data',
 'Machine Vision for UAS Ground Operations',
 'A deep learning based secured energy management framework within a smart island',
 'Automatic Video Editing for Sensor-Rich Videos',
 'Using Touchscreen Interaction Data to Predict Cognitive Workload',
 'Surface EMG vs. High-Density EMG: Tradeoff Between Performance and Usability for Head Orientation Prediction in VR Application',
 'Sniff Species: SURMOF-Based Sensor Array Discriminates Aromatic Plants beyond the Genus Level']

In [65]:
net.vs['abstract'] = [abst_map[uid] for uid in net.vs['name']]

In [67]:
# net.vs['abstract'][:5]

In [68]:
xnet.igraph2xnet(net, 'cit_sensos_network_daniel_06_02.xnet')

In [69]:
net.vs.attributes()

['name', 'title', 'abstract']