# Automated Classification of Materials Datasets with NLP

In [120]:
import json
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET

Extract the abstract and data origin of each record from the JSON data

In [178]:
# Path of the JSON dump that the loading cell opens and parses.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the dump before running the notebook elsewhere.
DATA_LOCATION = "/home/aswathy/src/MPCS/Winter_18/Practicum/data/mrr-dump-4Jan18.json"

In [115]:
# Read the JSON dump; each item carries one XML record under its 'content' key.
with open(DATA_LOCATION, "r") as fin:
    json_content = json.load(fin)

# XML namespace shared by every element looked up below.
ns = {'nist' : 'http://schema.nist.gov/xml/res-md/1.0wd-02-2017'}

# Parallel lists: one entry per kept record, so they must always be appended together.
abstracts = []
onehot_sims = []
onehot_exp = []
onehot_info = []

for json_item in json_content:
    xmlstr = json_item['content']
    root = ET.fromstring(xmlstr)

    # A record without an applicability/dataOrigin section has no labels: throw it away.
    applicability = root.find('nist:applicability', namespaces=ns)
    if applicability is None:
        continue
    dataOrigin = applicability.find('nist:dataOrigin', namespaces=ns)
    if dataOrigin is None:
        continue

    # A record without a description has no text to classify: throw it away
    # *before* touching any list, so the four lists stay aligned.
    content = root.find('nist:content', namespaces=ns)
    if content is None:
        continue
    desc = content.findtext('nist:description', namespaces=ns)
    if desc is None:
        continue

    # Use identity checks against None: an Element's truth value depends on its
    # children, so only `is None` reliably means "tag absent".
    simulations = dataOrigin.find('nist:simulations', namespaces=ns)
    experiments = dataOrigin.find('nist:experiments', namespaces=ns)
    informatics = dataOrigin.find('nist:informatics_and_data_science', namespaces=ns)

    # One-hot encode each origin: 1 when the tag is present, 0 otherwise.
    onehot_sims.append(int(simulations is not None))
    onehot_exp.append(int(experiments is not None))
    onehot_info.append(int(informatics is not None))

    abstracts.append(desc.lower())


In [116]:
# Assemble one row per record: the abstract text plus its three binary labels.
label_columns = ["Experiments", "Simulations", "Informatics"]
df = pd.DataFrame(
    {
        "Abstract": abstracts,
        "Experiments": onehot_exp,
        "Simulations": onehot_sims,
        "Informatics": onehot_info,
    }
)

# Keep only the records that carry at least one of the three labels.
has_any_label = df[label_columns].eq(1).any(axis=1)
df = df[has_any_label]
len(df)

73

In [117]:
# Preview the first rows of the labelled abstracts.
df.head()

Unnamed: 0,Abstract,Experiments,Informatics,Simulations
0,simulation outputs for a total of more than 23...,0,0,1
1,we demonstrate automated generation of diffusi...,0,0,1
2,a database has been constructed that contains ...,1,0,0
3,long-standing challenges in cluster expansion ...,0,0,1
4,this notebook makes use of the pymatgen packag...,0,0,1


In [142]:
def accuracy(df):
    """Return the fraction of label slots that were predicted correctly.

    Each row contributes three slots, one per class. A slot counts as a hit
    when the true column equals its prediction column:
    Experiments/pred_exp, Simulations/pred_sim, Informatics/pred_info.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three true-label columns and the three
        prediction columns named above.

    Returns
    -------
    float
        Number of matching slots divided by ``len(df) * 3``.
    """
    label_pairs = (
        ("Experiments", "pred_exp"),
        ("Simulations", "pred_sim"),
        ("Informatics", "pred_info"),
    )

    hits = 0
    for truth_col, pred_col in label_pairs:
        hits += sum(df[truth_col] == df[pred_col])

    total_slots = len(df) * 3
    return hits / total_slots

## Random Model

- Assigns each label independently at random: a label is predicted as 1 when a uniform random draw exceeds 0.5
- Accuracy is around 0.5 when averaged over more than 10 iterations

In [173]:
# Seed a dedicated generator so this stochastic baseline gives the same
# number on every Restart & Run All.
RANDOM_SEED = 0
rng = np.random.RandomState(RANDOM_SEED)

avg_acc = 0.0
iterations = 100

for i in range(iterations):
    # Each label is guessed independently: 1 when a uniform draw exceeds 0.5.
    # The guesses go into a scratch frame so df is not left holding
    # throwaway random predictions.
    random_guess = df.assign(
        pred_exp=(rng.rand(len(df)) > 0.5).astype(int),
        pred_sim=(rng.rand(len(df)) > 0.5).astype(int),
        pred_info=(rng.rand(len(df)) > 0.5).astype(int),
    )

    avg_acc += accuracy(random_guess)

# Mean accuracy over all random trials.
avg_acc /= iterations
avg_acc

0.49776255707762557

## Naive Model

- Checks whether the class label is explicitly present in the text
- A point is awarded for each class whose predicted one-hot value matches the actual one-hot value

In [176]:
# Keyword baseline: a class is predicted (1) when its keyword occurs anywhere
# in the lower-cased abstract, otherwise 0.
keyword_by_column = (
    ('pred_exp', 'experiment'),
    ('pred_sim', 'simulation'),
    ('pred_info', 'informatics'),
)
for pred_column, keyword in keyword_by_column:
    keyword_found = df['Abstract'].str.contains(keyword)
    df[pred_column] = keyword_found.astype(int)

df.head()

Unnamed: 0,Abstract,Experiments,Informatics,Simulations,pred_exp,pred_sim,pred_info
0,simulation outputs for a total of more than 23...,0,0,1,0,1,0
1,we demonstrate automated generation of diffusi...,0,0,1,1,0,0
2,a database has been constructed that contains ...,1,0,0,0,0,0
3,long-standing challenges in cluster expansion ...,0,0,1,0,0,0
4,this notebook makes use of the pymatgen packag...,0,0,1,0,0,0


In [177]:
# Score the keyword-based predictions against the true labels.
accuracy(df)

0.684931506849315

0.5