In [1]:
import commons.parse
import commons.graph
import commons.scores
import networkx as nx
import warnings
import nltk

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter (no category or module given), so it also
# hides deprecation notices from the imported libraries — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Call the project helper before any sampling is done.
# NOTE(review): presumably this refreshes the lists of files that may / may not be
# sampled; what it reads and writes is defined in commons.parse — confirm there.
commons.parse.update_allowed_forbidden_files()

In [None]:
# Draw a sample of 15 articles and parse them.
parsed_articles = commons.parse.parse_and_sample(15)
print("Sampling and parsing complete \n")

# A single GraphMaker carries the stopword list and the stemmer choice ('SNO')
# that are applied to every graph built below.
gm = commons.graph.GraphMaker('resources/longStopwords.txt', 'SNO')

# Attach to each article the co-occurrence graph built from its abstract.
for i, article in enumerate(parsed_articles):
    print("I am now building the graph for article at index " + str(i))
    article.graph = gm.buildGraph(article.abstract)

# Example of inspecting one article: file name, abstract, graph nodes, and the
# graph itself as rendered by the project's printGraph helper.
sample_article = parsed_articles[5]
print(sample_article.filename)
print(sample_article.abstract)
print(sample_article.graph.nodes)
commons.graph.printGraph(sample_article.graph)

In [None]:
# Store three approximate centrality scores as node attributes on every article graph.
# NOTE(review): the meaning of the tuning values 10 and 0.7 is defined by the
# commons.graph helpers — confirm there before changing them.
for a in parsed_articles:
    graph = a.graph

    # Approximate closeness centrality -> node attribute 'apprCC'.
    myDictionary = commons.graph.approximateClosenessCentrality(graph, 10)
    nx.set_node_attributes(graph, myDictionary, name='apprCC')

    # Local PageRank approximation -> node attribute 'apprPR'.
    page_rank_scores = commons.graph.localPageRankApprox(graph)
    nx.set_node_attributes(graph, page_rank_scores, name='apprPR')

    # Improved LCC estimate -> node attribute 'apprLCC'.
    lcc_scores = commons.graph.improvedEstimateLCC(graph, 0.7)
    nx.set_node_attributes(graph, lcc_scores, name='apprLCC')

In [None]:
# Print the abstract of the sampled article at index 4.
print(parsed_articles[4].abstract)

In [None]:
# Build and show the graph of one hand-picked abstract, using a GraphMaker
# configured with the long stopword list and the 'SNO' stemmer code.
gm = commons.graph.GraphMaker('resources/longStopwords.txt', 'SNO')

# The abstract is kept verbatim (including its irregular spacing) because the
# graph is built directly from this text.
sample_abstract = "This data article presents the experimental evidences of the effect of TiO2-fluorine doped tin oxide interface annealing and Ni(OH)2 cocatalysts on the photoelectrochemical  structural  morphological and optical properties of Ni(OH)2/CdS/ZnIn2S4/TiO2 heterojunction. The Raman spectroscopy exhibits the sharp features of the rutile phase of TiO2 and in agreement with the X-ray diffraction data. The band gap energy of the 500 °C sample was found to be 3.12 eV  further it was increased to 3.20  3.22 eV for samples annealed at 600 and 700 °C respectively. The decrease in the band gap energy at 500 °C related to the oxygen vacancies and was analysed by photoluminescence spectroscopy analysis. The synthesis  characterization methods and other experimental details of TiO2 based heterostructure are also provided. The presence of CdS and ZnIn2S4 coating on surface of TiO2 electrodes providing a high surface area  extended visible absorption and helps to improve the change separation. This data article contains data related to the research article entitled “Highly efficient and stable 3D Ni(OH)2/CdS/ZnIn2S4/TiO2 heterojunction under solar light: Effect of an improved TiO2/FTO interface and cocatalyst” (Mahadik et al.  2017) [1]"
ggraph = gm.buildGraph(sample_abstract)

commons.graph.printGraph(ggraph)

In [None]:
# PageRank of the example graph with damping parameter alpha = 0.4,
# passed by keyword so the meaning of the number is visible at the call site.
myDictionary = nx.pagerank(ggraph, alpha=0.4)
print(myDictionary)

In [4]:
# PageRank experiment: create a GraphMaker with stemmer code 'LAN' and draw a
# fresh sample of 15 articles.
# NOTE(review): this rebinds `gm`, which earlier cells created with 'SNO'; any
# later cell that reads `gm` depends on which of these cells ran last.
gm = commons.graph.GraphMaker('resources/longStopwords.txt','LAN')
# The GraphMaker is handed to the parser here; presumably the parser uses it to
# build each article's graph and to stem its keywords — confirm in commons.parse.
articles = commons.parse.parse_and_sample(15, gm)
# NOTE(review): `an_article` and `article_graph` are not read by any other cell
# visible in this notebook.
an_article = articles[5]
article_graph = gm.buildGraph(an_article.abstract)
# Display the abstract of the article at index 11, the one examined in the cells below.
articles[11].abstract

'The South African genus Gymnodiscus Less. (Senecioneae: Othonninae) is distinguished from other genera in subtribe Othonninae by its annual habit and lack of pappus in the disc florets. We recognise two species of Gymnodiscus differing in leaf morphology number of involucral bracts and florets shape of the ray florets and cypsela vestiture. A comprehensive taxonomic treatment is presented including descriptions complete nomenclature and typification illustrations and geographical distribution. Gymnodiscus orbicularifolius Sch. Bip. is treated as a synonym of Gymnodiscus capillaris L.f.'

In [5]:
# Keep the keyword list of the article at index 11 under a short name; the
# precision/recall cell further down compares it with the ranked graph nodes.
keywords = articles[11].kw
keywords

['new', 'synonym', 'nomenc', 'othonn', 'southern', 'afric', 'taxonom']

In [6]:
# PageRank (networkx defaults) over the graph of article 11.
pagerank_res = nx.pagerank(articles[11].graph)

# (node, score) pairs ordered from the highest score to the lowest; the in-place
# sort is stable, so equal scores keep their original relative order.
sorted_res = list(pagerank_res.items())
sorted_res.sort(key=lambda pair: pair[1], reverse=True)

# Number of ranked nodes.
len(sorted_res)

45

In [10]:
# Names of the ten nodes with the highest PageRank score, best first.
prec_res = [
    node
    for node, _score in sorted(pagerank_res.items(), key=lambda item: item[1], reverse=True)[:10]
]
prec_res

['gymnodiscus',
 'floret',
 'recognis',
 'speci',
 'differ',
 'leaf',
 'morpholog',
 'number',
 'involucr',
 'bract']

In [7]:
# Rank the nodes by PageRank score, highest first.
sorted_pagerank = sorted(pagerank_res.items(), key=lambda item: item[1], reverse=True)
pagerank_nodes = [node[0] for node in sorted_pagerank]

# Cutoffs at which precision and recall are reported (the values are arbitrary).
# A cutoff larger than the ranking is clamped to the ranking's length, and the set
# removes the duplicates that clamping produces. Each cutoff is therefore scored
# and printed exactly once, including when the ranking length equals one of the
# requested values (the earlier loop printed that line twice).
cutoffs = sorted({min(k, len(pagerank_nodes)) for k in [10, 15, 20, 50, 100]})
for i in cutoffs:
    p_at_i, r_at_i = commons.scores.precision_recall(keywords, pagerank_nodes[:i])
    print(f'P@{i}: {p_at_i}; R@{i}: {r_at_i}')

P@10: 0.0; R@10: 0.0
P@15: 0.0; R@15: 0.0
P@20: 0.05; R@20: 0.14285714285714285
P@45: 0.06666666666666667; R@45: 0.42857142857142855


In [27]:
# Show the full (node, PageRank score) ranking, highest score first.
sorted_pagerank

[('gymnodiscus', 0.054595082022899026),
 ('floret', 0.04275080930792535),
 ('recognis', 0.0249730423188773),
 ('speci', 0.0249730423188773),
 ('differ', 0.0249730423188773),
 ('leaf', 0.0249730423188773),
 ('morpholog', 0.0249730423188773),
 ('number', 0.0249730423188773),
 ('involucr', 0.0249730423188773),
 ('bract', 0.0249730423188773),
 ('shape', 0.0249730423188773),
 ('ray', 0.0249730423188773),
 ('cypsela', 0.0249730423188773),
 ('vestitur', 0.0249730423188773),
 ('comprehens', 0.0221483942414175),
 ('taxonom', 0.0221483942414175),
 ('treatment', 0.0221483942414175),
 ('present', 0.0221483942414175),
 ('includ', 0.0221483942414175),
 ('descript', 0.0221483942414175),
 ('complet', 0.0221483942414175),
 ('nomenclatur', 0.0221483942414175),
 ('typif', 0.0221483942414175),
 ('illustr', 0.0221483942414175),
 ('geograph', 0.0221483942414175),
 ('distribut', 0.0221483942414175),
 ('senecionea', 0.020854409057866653),
 ('othonnina', 0.020854409057866653),
 ('distinguish', 0.02085440905786

In [21]:
# List every node of the graph attached to the article at index 11.
list(articles[11].graph.nodes())

['south',
 'african',
 'genus',
 'gymnodiscus',
 'less',
 'treat',
 'synonym',
 'capillari',
 'l',
 'senecionea',
 'othonnina',
 'distinguish',
 'genera',
 'subtrib',
 'annual',
 'habit',
 'lack',
 'pappus',
 'disc',
 'floret',
 'recognis',
 'speci',
 'differ',
 'leaf',
 'morpholog',
 'number',
 'involucr',
 'bract',
 'shape',
 'ray',
 'cypsela',
 'vestitur',
 'comprehens',
 'taxonom',
 'treatment',
 'present',
 'includ',
 'descript',
 'complet',
 'nomenclatur',
 'typif',
 'illustr',
 'geograph',
 'distribut',
 'sch',
 'bip']

In [19]:
# Display the keywords of the article at index 11.
# NOTE(review): the recorded output of this attribute differs between cells of this
# notebook — presumably `articles` was re-created with a different stemmer in
# between; confirm by re-running the notebook top to bottom.
articles[11].kw

['new', 'synonym', 'nomenclatur', 'othonna', 'southern', 'africa', 'taxonomi']

In [22]:
# Score the complete node set of article 11 against its keywords (no ranking cutoff).
# NOTE(review): the returned pair is presumably (precision, recall), matching how the
# P@k cell unpacks it — confirm in commons.scores.
commons.scores.precision_recall(articles[11].kw, list(articles[11].graph.nodes()))

(0.043478260869565216, 0.2857142857142857)

In [29]:
# Stemmer held by the current GraphMaker, and what it does to two related word forms.
# NOTE(review): which stemmer this is depends on the most recently executed `gm`
# assignment — re-run the notebook in order before relying on the output.
stemmer = gm.stemmer
(type(stemmer),) + tuple(stemmer.stem(word) for word in ('taxonomic', 'taxonomy'))

(nltk.stem.snowball.SnowballStemmer, 'taxonom', 'taxonomi')

In [30]:
# Same probe with the stemmer of a GraphMaker created with the 'POR' code.
por_maker = commons.graph.GraphMaker('resources/longStopwords.txt', 'POR')
stemmer = por_maker.stemmer
(type(stemmer),) + tuple(stemmer.stem(word) for word in ('taxonomic', 'taxonomy'))

(nltk.stem.porter.PorterStemmer, 'taxonom', 'taxonomi')

In [34]:
# Same probe with NLTK's Lancaster stemmer, instantiated directly.
stemmer = nltk.stem.LancasterStemmer()
(type(stemmer),) + tuple(stemmer.stem(word) for word in ('taxonomic', 'taxonomy'))

(nltk.stem.lancaster.LancasterStemmer, 'taxonom', 'taxonom')