# Automated Classification of Materials Datasets with NLP

In [1]:
import re
import json
import spacy
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from xml.etree import ElementTree as ET
from sklearn.model_selection import train_test_split

Extracting abstract and origin from the json data

In [2]:
DATA_LOCATION = "/home/aswathy/src/MPCS/Winter_18/Practicum/Practicum-MDF/data/mrr-dump-4Jan18.json"

In [3]:
#reading in json file
with open(DATA_LOCATION, "r") as fin:
    json_content = json.load(fin)
    
#Namespace
ns = {'nist' : 'http://schema.nist.gov/xml/res-md/1.0wd-02-2017'}
abstracts = []
onehot_sims = []
onehot_exp = []
onehot_info = []

for json_item in json_content:
    schema = json_item["schema"]
    if schema != "599de0fdbe2d44172d372533":
        continue
    xmlstr = json_item['content']
    root = ET.fromstring(xmlstr)
    applicability = root.find('nist:applicability', namespaces=ns)
    #if no origin then throw away
    if ((root.attrib['status'] == "deleted") or (applicability == None)):
        continue
        
    content = root.find('nist:content', namespaces=ns)    
    desc = content.findtext('nist:description', namespaces=ns)
    
    if desc == '':
        continue 
    
    dataOrigin = applicability.findall('nist:dataOrigin', namespaces=ns)


    is_sim = False
    is_exp = False
    is_info = False

    for dOrigin in dataOrigin:
        simulations = dOrigin.find('nist:simulations', namespaces=ns)
        if simulations != None :
            is_sim = True
            
        experiments = dOrigin.find('nist:experiments', namespaces=ns)
        if experiments != None :
            is_exp = True
            
        informatics = dOrigin.find('nist:informatics_and_data_science', namespaces=ns)
        if informatics != None :
            is_info = True
    
    if is_sim:
        onehot_sims.append(1)
    else:
        onehot_sims.append(0)
        
    if is_exp:
        onehot_exp.append(1)
    else:
        onehot_exp.append(0)
        
    if is_info:
        onehot_info.append(1)
    else:
        onehot_info.append(0)
        
#     ' '.join(desc.split())
    abstracts.append(desc)


In [4]:
df = pd.DataFrame({"Abstract" : abstracts,"Experiments" : onehot_exp, "Simulations" : onehot_sims, "Informatics" : onehot_info})              
df = df[(df.Experiments == 1) | (df.Simulations == 1) | (df.Informatics == 1)]
len(df)

70

In [16]:
df.head()

Unnamed: 0,Abstract,Experiments,Informatics,Simulations
0,We demonstrate automated generation of diffusi...,0,0,1
1,Long-standing challenges in cluster expansion ...,0,0,1
2,This notebook makes use of the pymatgen packag...,0,0,1
3,Metal-organic frameworks (MOFs) are a class of...,0,0,1
4,Three oxidatively and hydrolytically stable Cp...,1,0,1


In [17]:
def accuracy():
    return (sum(df.Experiments == df.pred_exp) + sum(df.Simulations == df.pred_sim) + sum(df.Informatics == df.pred_info)) / (len(df) * 3)

def precision():
    TP = len(df[(df.Experiments == 1) & (df.pred_exp == 1)]) + len(df[(df.Simulations == 1) & (df.pred_sim == 1)]) + len(df[(df.Informatics == 1) & (df.pred_info == 1)])
    FP = len(df[(df.Experiments == 0) & (df.pred_exp == 1)]) + len(df[(df.Simulations == 0) & (df.pred_sim == 1)]) + len(df[(df.Informatics == 0) & (df.pred_info == 1)])
    return (TP / (TP + FP))

def recall():
    TP = len(df[(df.Experiments == 1) & (df.pred_exp == 1)]) + len(df[(df.Simulations == 1) & (df.pred_sim == 1)]) + len(df[(df.Informatics == 1) & (df.pred_info == 1)])
    FN = len(df[(df.Experiments == 1) & (df.pred_exp == 0)]) + len(df[(df.Simulations == 1) & (df.pred_sim == 0)]) + len(df[(df.Informatics == 1) & (df.pred_info == 0)])
    return (TP / (TP + FN))

def F_score(P, R):
    return (2*P*R / (P + R))

## Random Model

- Assigns 1 to simulations and experiments and 0 to informatics

In [18]:
avg_acc = 0.0
avg_prec = 0.0
avg_recall = 0.0

df['pred_exp'] = 1#(np.random.rand(len(df)) > 0.5).astype(int)
df['pred_sim'] = 1#(np.random.rand(len(df)) > 0.56).astype(int)
df['pred_info'] = 0#(np.random.rand(len(df)) > 0.5).astype(int)

avg_acc = accuracy()
avg_prec = precision()
avg_recall = recall() 

f_score = F_score(avg_prec, avg_recall)

print("Accuracy : " + str(avg_acc))
print("Precision : " + str(avg_prec))
print("Recall : " + str(avg_recall))
print("F-Score : " + str(f_score))

Accuracy : 0.6904761904761905
Precision : 0.6071428571428571
Recall : 0.8947368421052632
F-Score : 0.7234042553191489


## Naive Model

- Checks whether the class label is explicilty present in the text
- If actual one-hot encoded value (for each class) is same as predicted one-hot value, then a point is awarded

In [19]:
exp_features = ['experiment', 'spectr', 'sample', 'activ', 'measur', 'construct']
sim_features = ['simulat', 'demo', 'method', 'benchmark', 'dft', 'plot', 'machine learning', 'density']
info_features = ['informatics', 'comment', 'collection', 'compile']

df['pred_exp'] = df.Abstract.str.lower().str.contains('|'.join(exp_features)).astype(int)
df['pred_sim'] = df.Abstract.str.lower().str.contains('|'.join(sim_features)).astype(int)
df['pred_info'] = df.Abstract.str.lower().str.contains('|'.join(info_features)).astype(int)

df.head()

Unnamed: 0,Abstract,Experiments,Informatics,Simulations,pred_exp,pred_sim,pred_info
0,We demonstrate automated generation of diffusi...,0,0,1,1,1,1
1,Long-standing challenges in cluster expansion ...,0,0,1,1,1,0
2,This notebook makes use of the pymatgen packag...,0,0,1,0,1,0
3,Metal-organic frameworks (MOFs) are a class of...,0,0,1,1,0,0
4,Three oxidatively and hydrolytically stable Cp...,1,0,1,1,0,0


In [20]:
print("Accuracy : " + str(accuracy()))
P = precision()
R = recall()
F = F_score(P,R)

print("Precision : " + str(P))
print("Recall : " + str(R))
print("F-Score : " + str(F))

Accuracy : 0.7761904761904762
Precision : 0.7727272727272727
Recall : 0.7157894736842105
F-Score : 0.7431693989071039


## Pre-processing
- Stop word removal
- Lemmatization
- Removed words that occur only once in hole document

In [21]:
nlp = spacy.load('en')

In [22]:
#Combining stopwords from nltk and spaCy
stopWordsUnion = list(set().union(stopwords.words('english'), nlp.Defaults.stop_words))

In [23]:
#List of stopwords
stopWordsUnion

['haven',
 'once',
 'seeming',
 'ten',
 't',
 'else',
 'now',
 'am',
 'make',
 'will',
 'eight',
 'up',
 'is',
 'often',
 'nor',
 'so',
 'your',
 'as',
 'itself',
 'go',
 'onto',
 'against',
 'others',
 'serious',
 'hers',
 'all',
 'fifteen',
 'us',
 'toward',
 'twenty',
 'thru',
 'through',
 'ours',
 'further',
 'herself',
 'amongst',
 'since',
 'such',
 'anywhere',
 'few',
 'last',
 'somehow',
 'least',
 'too',
 'shan',
 'because',
 'whatever',
 'wherein',
 'quite',
 'regarding',
 'each',
 'whole',
 'themselves',
 'was',
 'noone',
 'has',
 'shouldn',
 'thence',
 'are',
 'rather',
 'everyone',
 'elsewhere',
 'everywhere',
 'hence',
 'seems',
 'than',
 'that',
 'its',
 'ca',
 'due',
 'i',
 'hereby',
 'one',
 'of',
 'if',
 'whose',
 'you',
 'didn',
 'had',
 'take',
 'sometime',
 'ain',
 'among',
 'hasn',
 'therefore',
 'how',
 'while',
 'o',
 'done',
 'give',
 'above',
 'across',
 'via',
 'll',
 'them',
 'although',
 'just',
 'several',
 'for',
 'been',
 'next',
 'they',
 'theirs',
 'ni

In [24]:
import regex as re
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\),."']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    return spacy.tokenizer.Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)

nlp = spacy.load('en')
nlp.tokenizer = custom_tokenizer(nlp)

In [25]:
vocab_freq = {}
abstract_space = []
for abs_string in df.Abstract:   
    doc = nlp(abs_string) 
    
    words_filtered = [str(w.lemma_).lower() for w in doc if (not str(w).lower() in stopWordsUnion and str(w).isalpha())]
    for i in range(len(words_filtered)):
        if words_filtered[i] in vocab_freq.keys():
            vocab_freq[words_filtered[i]] += 1
        else:
            vocab_freq[words_filtered[i]] = 1
    abstract_space.append(words_filtered)
    
for abstract in abstract_space:
    for word in abstract:
        if vocab_freq[word] < 2:
            abstract.remove(word)

In [26]:
preprocessed_astracts = []
for abstract in abstract_space:
    preprocessed_astracts.append(' '.join(abstract))

In [27]:
preprocessed_astracts[1]

'long challenge cluster expansion ce construction include truncate expansion crystal structure use training compressive cs powerful tool model construction physics provide rigorous framework address challenge develop bayesian implementation cs bcs provide framework speed current ce construction technique error estimate model coefficient demonstrate use bcs cluster expansion model binary alloy system speed method accuracy result fit show far state evolutionary method alloy system show combine high throughput principle framework bcs lattice model construct way high throughput thermodynamic modeling alloy grateful financial support nsf dmr grateful financial support nsf gratefully financial support nsf dmr use compute resource national energy research scientific center support doe contract de support doe contract de'

In [28]:
df.Abstract[1]

'Long-standing challenges in cluster expansion (CE) construction include choosing how to truncate the expansion and which crystal structures to use for training. Compressive sensing (CS), which is emerging as a powerful tool for model construction in physics, provides a mathematically rigorous framework for addressing these challenges. A recently-developed Bayesian implementation of CS (BCS) provides a parameterless framework, a vast speed-up over current CE construction techniques, and error estimates on model coefficients. Here, we demonstrate the use of BCS to build cluster expansion models for several binary alloy systems. The speed of the method and the accuracy of the resulting fits are shown to be far superior than state-of-the-art evolutionary methods for all alloy systems shown. When combined with high-throughput first-principles frameworks, the implications of BCS are that hundreds of lattice models can be automatically constructed, paving the way to high-throughput thermodyn

## Creating files for fasttext

In [29]:
prepped_df = df.loc[:, ['Abstract', 'Experiments', 'Simulations', 'Informatics']]
prepped_df.loc[:, 'Abstract'] = preprocessed_astracts

prepped_df.loc[prepped_df.loc[:,'Experiments'] == 1, 'Experiments'] = '__label__1'
prepped_df.loc[prepped_df.loc[:,'Experiments'] == 0, 'Experiments'] = '__label__0'

prepped_df.loc[prepped_df.loc[:,'Simulations'] == 1, 'Simulations'] = '__label__1'
prepped_df.loc[prepped_df.loc[:,'Simulations'] == 0, 'Simulations'] = '__label__0'

prepped_df.loc[prepped_df.loc[:,'Informatics'] == 1, 'Informatics'] = '__label__1'
prepped_df.loc[prepped_df.loc[:,'Informatics'] == 0, 'Informatics'] = '__label__0'

In [30]:
train, test = train_test_split(prepped_df, test_size=0.2)

exp_train = train.loc[:,['Experiments', 'Abstract']]
exp_test = test.loc[:,['Abstract']]

sim_train = train.loc[:,['Simulations', 'Abstract']]
sim_test = test.loc[:,['Abstract']]

info_train = train.loc[:,['Informatics', 'Abstract']]
info_test = test.loc[:,['Abstract']]

Saving files 

In [31]:
exp_train.to_csv('../data/fastText-0.1.0/exp_train.txt', sep = " ", header = False, index=False)
exp_test.to_csv('../data/fastText-0.1.0/exp_test.txt', sep = " ", header = False, index=False)

sim_train.to_csv('../data/fastText-0.1.0/sim_train.txt', sep = " ", header = False, index=False)
sim_test.to_csv('../data/fastText-0.1.0/sim_test.txt', sep = " ", header = False, index=False)

info_train.to_csv('../data/fastText-0.1.0/info_train.txt', sep = " ", header = False, index=False)
info_test.to_csv('../data/fastText-0.1.0/info_test.txt', sep = " ", header = False, index=False)

In [32]:
exp_train

Unnamed: 0,Experiments,Abstract
60,__label__1,collection x ray absorption spectra datum good...
42,__label__0,research data support linear scale density fun...
44,__label__1,family bridge aggregate construct cycloplatina...
10,__label__0,benchmark molecule introduce test implementati...
22,__label__1,release number partially dataset use dataset i...
3,__label__0,metal organic framework mof class nanoporous c...
55,__label__0,datum file correspond publication list file in...
36,__label__0,oqmd database dft calculate thermodynamic stru...
43,__label__1,platinum acetate blue pab empirical byproduct ...
65,__label__0,segregation change cohesion mechanical propert...


In [33]:
exp_test

Unnamed: 0,Abstract
51,provide example temperature calculation solid ...
15,pseudopotential file ge h format calculation i...
66,datum set contain original xps datum collect s...
47,datum set consist molecular dynamic trajectory...
5,welcome coherent x ray imaging data cxidb new ...
4,hydrolytically stable complex synthesize fully...
8,nanostructure form phase separation improve th...
48,new heptanuclear complex prepare attach terpyr...
64,nanocluster relevance catalysis sensor fundame...
50,information publication calculation second ord...


In [34]:
import subprocess

In [35]:
subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input exp_train.txt -output model_exp -epoch 100 -lr 0.5; ./fasttext predict-prob model_exp.bin exp_test.txt > exp.out", shell=True, check=True, stdout=subprocess.PIPE)

CompletedProcess(args='cd ../data/fastText-0.1.0/; ./fasttext supervised -input exp_train.txt -output model_exp -epoch 100 -lr 0.5; ./fasttext predict-prob model_exp.bin exp_test.txt > exp.out', returncode=0, stdout=b'')

In [36]:
subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input sim_train.txt -output model_sim -epoch 100 -lr 0.5; ./fasttext predict-prob model_sim.bin sim_test.txt > sim.out", shell=True, check=True, stdout=subprocess.PIPE)

CompletedProcess(args='cd ../data/fastText-0.1.0/; ./fasttext supervised -input sim_train.txt -output model_sim -epoch 100 -lr 0.5; ./fasttext predict-prob model_sim.bin sim_test.txt > sim.out', returncode=0, stdout=b'')

In [37]:
subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input info_train.txt -output model_info -epoch 100 -lr 0.5; ./fasttext predict-prob model_info.bin info_test.txt > info.out", shell=True, check=True, stdout=subprocess.PIPE)

CompletedProcess(args='cd ../data/fastText-0.1.0/; ./fasttext supervised -input info_train.txt -output model_info -epoch 100 -lr 0.5; ./fasttext predict-prob model_info.bin info_test.txt > info.out', returncode=0, stdout=b'')

In [38]:
pred_exp = []
with open('../data/fastText-0.1.0/exp.out', "r") as exp_out:
    lines = exp_out.readlines()
    for line in lines:
        label = line.split()[0]
        pred_exp.append(label)

test = test.assign(pred_exp=pred_exp)

In [39]:
pred_sim = []
with open('../data/fastText-0.1.0/sim.out', "r") as sim_out:
    lines = sim_out.readlines()
    for line in lines:
        label = line.split()[0]
        pred_sim.append(label)

test = test.assign(pred_sim=pred_sim)

In [40]:
pred_info = []
with open('../data/fastText-0.1.0/info.out', "r") as info_out:
    lines = info_out.readlines()
    for line in lines:
        label = line.split()[0]
        pred_info.append(label)

test = test.assign(pred_info=pred_info)

In [41]:
test

Unnamed: 0,Abstract,Experiments,Simulations,Informatics,pred_exp,pred_sim,pred_info
51,provide example temperature calculation solid ...,__label__0,__label__1,__label__0,__label__1,__label__1,__label__0
15,pseudopotential file ge h format calculation i...,__label__0,__label__1,__label__0,__label__0,__label__1,__label__0
66,datum set contain original xps datum collect s...,__label__1,__label__1,__label__0,__label__1,__label__0,__label__0
47,datum set consist molecular dynamic trajectory...,__label__0,__label__1,__label__0,__label__1,__label__1,__label__0
5,welcome coherent x ray imaging data cxidb new ...,__label__1,__label__0,__label__0,__label__1,__label__0,__label__0
4,hydrolytically stable complex synthesize fully...,__label__1,__label__1,__label__0,__label__0,__label__1,__label__0
8,nanostructure form phase separation improve th...,__label__0,__label__1,__label__0,__label__0,__label__1,__label__0
48,new heptanuclear complex prepare attach terpyr...,__label__1,__label__1,__label__0,__label__1,__label__1,__label__0
64,nanocluster relevance catalysis sensor fundame...,__label__0,__label__1,__label__0,__label__0,__label__1,__label__0
50,information publication calculation second ord...,__label__0,__label__1,__label__0,__label__0,__label__1,__label__0


In [42]:
def accuracy():
    return (sum(test.Experiments == test.pred_exp) + sum(test.Simulations == test.pred_sim) + sum(test.Informatics == test.pred_info)) / (len(test) * 3)

def precision():
    TP = len(test[(test.Experiments == 1) & (test.pred_exp == 1)]) + len(test[(test.Simulations == 1) & (test.pred_sim == 1)]) + len(test[(test.Informatics == 1) & (test.pred_info == 1)])
    FP = len(test[(test.Experiments == 0) & (test.pred_exp == 1)]) + len(test[(test.Simulations == 0) & (test.pred_sim == 1)]) + len(test[(test.Informatics == 0) & (test.pred_info == 1)])
    return (TP / (TP + FP))

def recall():
    TP = len(test[(test.Experiments == 1) & (test.pred_exp == 1)]) + len(test[(test.Simulations == 1) & (test.pred_sim == 1)]) + len(test[(test.Informatics == 1) & (test.pred_info == 1)])
    FN = len(test[(test.Experiments == 1) & (test.pred_exp == 0)]) + len(test[(test.Simulations == 1) & (test.pred_sim == 0)]) + len(test[(test.Informatics == 1) & (test.pred_info == 0)])
    return (TP / (TP + FN))

def F_score(P, R):
    return (2*P*R / (P + R))

In [43]:
test.loc[test.loc[:,'Experiments'] == '__label__1', 'Experiments'] = 1
test.loc[test.loc[:,'Experiments'] == '__label__0', 'Experiments'] = 0

test.loc[test.loc[:,'Simulations'] == '__label__1', 'Simulations'] = 1
test.loc[test.loc[:,'Simulations'] == '__label__0', 'Simulations'] = 0

test.loc[test.loc[:,'Informatics'] == '__label__1', 'Informatics'] = 1
test.loc[test.loc[:,'Informatics'] == '__label__0', 'Informatics'] = 0

test.loc[test.loc[:,'pred_exp'] == '__label__1', 'pred_exp'] = 1
test.loc[test.loc[:,'pred_exp'] == '__label__0', 'pred_exp'] = 0

test.loc[test.loc[:,'pred_sim'] == '__label__1', 'pred_sim'] = 1
test.loc[test.loc[:,'pred_sim'] == '__label__0', 'pred_sim'] = 0

test.loc[test.loc[:,'pred_info'] == '__label__1', 'pred_info'] = 1
test.loc[test.loc[:,'pred_info'] == '__label__0', 'pred_info'] = 0

P = precision()
R = recall()
print("Accuracy : " + str(accuracy()))
print("Precision : " + str(P))
print("Recall : " + str(R))
print("F-Score : " + str(F_score(P, R)))

Accuracy : 0.8809523809523809
Precision : 0.8333333333333334
Recall : 0.8823529411764706
F-Score : 0.8571428571428571


In [44]:
iterations = 10
avg_acc = 0.0
avg_P = 0.0
avg_R = 0.0

for i in range(iterations):
    train, test = train_test_split(prepped_df, test_size=0.2)

    exp_train = train.loc[:,['Experiments', 'Abstract']]
    exp_test = test.loc[:,['Abstract']]

    sim_train = train.loc[:,['Simulations', 'Abstract']]
    sim_test = test.loc[:,['Abstract']]

    info_train = train.loc[:,['Informatics', 'Abstract']]
    info_test = test.loc[:,['Abstract']]
    
    exp_train.to_csv('../data/fastText-0.1.0/exp_train.txt', sep = " ", header = False, index=False)
    exp_test.to_csv('../data/fastText-0.1.0/exp_test.txt', sep = " ", header = False, index=False)

    sim_train.to_csv('../data/fastText-0.1.0/sim_train.txt', sep = " ", header = False, index=False)
    sim_test.to_csv('../data/fastText-0.1.0/sim_test.txt', sep = " ", header = False, index=False)

    info_train.to_csv('../data/fastText-0.1.0/info_train.txt', sep = " ", header = False, index=False)
    info_test.to_csv('../data/fastText-0.1.0/info_test.txt', sep = " ", header = False, index=False)
    
    subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input exp_train.txt -output model_exp -epoch 100; ./fasttext predict-prob model_exp.bin exp_test.txt > exp.out", shell=True, check=True, stdout=subprocess.PIPE)
    subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input sim_train.txt -output model_sim -epoch 100; ./fasttext predict-prob model_sim.bin sim_test.txt > sim.out", shell=True, check=True, stdout=subprocess.PIPE)
    subprocess.run("cd ../data/fastText-0.1.0/; ./fasttext supervised -input info_train.txt -output model_info -epoch 100; ./fasttext predict-prob model_info.bin info_test.txt > info.out", shell=True, check=True, stdout=subprocess.PIPE)
    
    pred_exp = []
    with open('../data/fastText-0.1.0/exp.out', "r") as exp_out:
        lines = exp_out.readlines()
        for line in lines:
            label = line.split()[0]
            pred_exp.append(label)

    test = test.assign(pred_exp=pred_exp)
    
    pred_sim = []
    with open('../data/fastText-0.1.0/sim.out', "r") as sim_out:
        lines = sim_out.readlines()
        for line in lines:
            label = line.split()[0]
            pred_sim.append(label)

    test = test.assign(pred_sim=pred_sim)
    
    pred_info = []
    with open('../data/fastText-0.1.0/info.out', "r") as info_out:
        lines = info_out.readlines()
        for line in lines:
            label = line.split()[0]
            pred_info.append(label)

    test = test.assign(pred_info=pred_info)
    test.loc[test.loc[:,'Experiments'] == '__label__1', 'Experiments'] = 1
    test.loc[test.loc[:,'Experiments'] == '__label__0', 'Experiments'] = 0

    test.loc[test.loc[:,'Simulations'] == '__label__1', 'Simulations'] = 1
    test.loc[test.loc[:,'Simulations'] == '__label__0', 'Simulations'] = 0

    test.loc[test.loc[:,'Informatics'] == '__label__1', 'Informatics'] = 1
    test.loc[test.loc[:,'Informatics'] == '__label__0', 'Informatics'] = 0

    test.loc[test.loc[:,'pred_exp'] == '__label__1', 'pred_exp'] = 1
    test.loc[test.loc[:,'pred_exp'] == '__label__0', 'pred_exp'] = 0

    test.loc[test.loc[:,'pred_sim'] == '__label__1', 'pred_sim'] = 1
    test.loc[test.loc[:,'pred_sim'] == '__label__0', 'pred_sim'] = 0

    test.loc[test.loc[:,'pred_info'] == '__label__1', 'pred_info'] = 1
    test.loc[test.loc[:,'pred_info'] == '__label__0', 'pred_info'] = 0

    avg_acc += accuracy()
    avg_P += precision()
    avg_R += recall()
    
avg_acc /= iterations
avg_P /= iterations
avg_R /= iterations
f_score = F_score(avg_P, avg_R)

print("Accuracy : " + str(avg_acc))
print("Precision : " + str(avg_P))
print("Recall : " + str(avg_R))
print("F-Score : " + str(f_score))


Accuracy : 0.8119047619047619
Precision : 0.7813503455608718
Recall : 0.8148030001474273
F-Score : 0.7977261175325395


In [45]:
with open('abstracts.txt', 'w') as f:
    for i in range(len(df.Abstract)):
        if i != 63:
            f.write(df.Abstract[i])

In [62]:
import copy
model_data = copy.deepcopy(test)
model_data.Abstract = df.Abstract[test.index]
model_data.to_csv("../data/model_data.csv")

In [90]:
model_data

Unnamed: 0,Abstract,Experiments,Simulations,Informatics,pred_exp,pred_sim,pred_info
29,NIST Standard Reference Database 13\nNIST JANA...,1,1,1,1,0,0
56,Resonant Ultrasound Spectroscopy data of a sam...,1,0,0,1,0,0
62,This dataset (and the added analysis software)...,0,0,1,1,0,0
55,These data files correspond to our publication...,0,1,0,0,1,0
54,Modeling oxygen interstitials in titanium requ...,0,1,0,0,1,0
42,"Research Data supporting ""Linear-Scaling Densi...",0,1,0,0,1,0
1,Long-standing challenges in cluster expansion ...,0,1,0,0,1,0
46,"DSC files, neutron diffraction data, hardness ...",1,1,0,1,1,0
66,This data set contains original XPS and NEXAFS...,1,1,0,1,1,0
60,This is a collection of X-ray Absorption Spect...,1,0,0,1,0,0


In [92]:
model_data.Abstract[24]

'Material Informatics for Polymer Nanocomposites\n\nA Nanocomposites Data Resource\nThis system allows for the registration of materials resources, bridging the gap between existing resources and the end users. NanoMine functions as a centrally located service, making the registered information available for research to the materials community.\nNSF supported the initial concept of NanoMine as a joint NU/RPI project through the CDMR program and is currently supporting a robust ontology and software framework via the DIBBS program. NIST/CHIMAD support data curation, tool development, demonstration materials design problems, and integration into the larger MGI network as well as provide backbone use of the Materials Data Curator system run by NIST.'

In [99]:
a = {"count" : 0, "score": 1}
b = a.copy()

In [117]:
str(u)

'851d0ab8-1cd0-11e8-a8c0-fcf8ae7354fb'

In [115]:
import uuid
import datetime
u = uuid.uuid1()

print(datetime.datetime.fromtimestamp((u.time - 0x01b21dd213814000)*100/1e9))

2018-02-28 15:44:14.684844
