In [2]:
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary

In [3]:
input_file = "./data/arxiv_results-1000.pkl"
df = pd.read_pickle(input_file)
df

Unnamed: 0,data,label,ceramic,polymer,semiconductor,metal,organometallic,experiment,simulation
0,[ZnO-based scintillation ceramics for applicat...,"(ceramic, experiment, material)",1,0,0,0,0,1,0
1,[A three-dimensional lattice of the Josephson ...,"(ceramic, simulation, material)",1,0,0,0,0,0,1
2,[Single polymer dynamics offers a powerful app...,"(polymer, experiment, material)",0,1,0,0,0,1,0
3,[Polymer chains in colloid-polymer mixtures ca...,"(polymer, simulation, material)",0,1,0,0,0,0,1
4,[The utilisation of semiconductor materials as...,"(semiconductor, experiment, material)",0,0,1,0,0,1,0
5,[We propose a method to realize diluted magnet...,"(semiconductor, simulation, material)",0,0,1,0,0,0,1
6,[Half-metallic ferromagnetism (HMFM) occurs ra...,"(metal, experiment, material)",0,0,0,1,0,1,0
7,"[The room temperature structure of aluminum, c...","(metal, simulation, material)",0,0,0,1,0,0,1
8,[We present a detailed analysis of the results...,"(organometallic, experiment, material)",0,0,0,0,1,1,0
9,[The interaction of trimethyl methylcyclopenta...,"(organometallic, simulation, material)",0,0,0,0,1,0,1


# Helper Functions

In [4]:
# Creates a new row for each value in a list in col_name. Maintains other columns
def explode_column(df, col_name):
    s = df.apply(lambda x: pd.Series(x[str(col_name)]),axis=1).stack().reset_index(level=1, drop=True)
    s.name = str(col_name)
    df = df.drop(str(col_name), axis=1).join(s)
    return df

In [5]:
y_cols = ["ceramic","polymer","semiconductor","metal","organometallic","experiment","simulation"]
x_cols = ["data"]

In [6]:
# # Separate all rows into individual abstract --> label
# dataset = []
# for i in range(0, len(df)):
#     for abstract in df.iloc[i]['data']:
#         dataset.append({"data":abstract.lower(), 
#                         "labels":df.iloc[i][y_cols].values})
    

# Reshape the Dataframe

In [7]:
df = explode_column(df, 'data')
df['data'] = df['data'][0:].str.lower()
len(df)

3770

# Generate a Gensim Corpus and Dictionary (word-->id mapping)

In [8]:
no_below = 5 # Word must appear at least 5 different docs
no_above  = 0.75 # Word must not appear in at least float percent of docs

corpus = df['data'].tolist()
split_corpus = [abstract.split(' ') for abstract in corpus]
dct = Dictionary(split_corpus)
dct.filter_extremes(no_below=no_below, no_above=no_above)

# Words to ID 

In [9]:
n_words = len(dct.iteritems())
n_words

7507

In [13]:
max_len = 150

def split_id(x):
    x = x.split(' ')
    x = dct.doc2idx(x)
    return x

df['data_id'] = df['data'][0:].map(split_id)
df['data_trunc'] = df['data_id'].map(lambda x: np.array(x[0:max_len]).clip(0,100000))


In [14]:
n_outputs = len(y_cols)
X = df['data_trunc'].as_matrix()
y = df[y_cols]

In [15]:
df.head()

Unnamed: 0,label,ceramic,polymer,semiconductor,metal,organometallic,experiment,simulation,data,data_id,data_trunc
0,"(ceramic, experiment, material)",1,0,0,0,0,1,0,zno-based scintillation ceramics for applicati...,"[-1, 52, 14, 27, 6, -1, -1, -1, -1, 31, 10, 33...","[0, 52, 14, 27, 6, 0, 0, 0, 0, 31, 10, 33, 0, ..."
0,"(ceramic, experiment, material)",1,0,0,0,0,1,0,the no{\nu}a experiment will construct a detec...,"[-1, -1, 102, 154, 87, -1, 91, 127, 27, 97, 11...","[0, 0, 102, 154, 87, 0, 91, 127, 27, 97, 119, ..."
0,"(ceramic, experiment, material)",1,0,0,0,0,1,0,a three-dimensional lattice of the josephson j...,"[-1, 197, 178, -1, -1, 176, 177, -1, -1, 173, ...","[0, 197, 178, 0, 0, 176, 177, 0, 0, 173, 0, 0,..."
0,"(ceramic, experiment, material)",1,0,0,0,0,1,0,the experiment with weighing pzt-piezoelectric...,"[-1, 102, -1, -1, -1, 204, 213, 146, 160, -1, ...","[0, 102, 0, 0, 0, 204, 213, 146, 160, 0, 214, ..."
0,"(ceramic, experiment, material)",1,0,0,0,0,1,0,ceramics zno:zn of 20mm diameter and 1.6mm thi...,"[14, -1, -1, -1, 243, -1, -1, 265, -1, 4, 44, ...","[14, 0, 0, 0, 243, 0, 0, 265, 0, 4, 44, 61, 14..."


In [16]:
from keras.preprocessing import sequence

X = sequence.pad_sequences(X, maxlen=max_len)

Using TensorFlow backend.


In [17]:
X.shape

(3770, 150)

In [18]:
y.shape

(3770, 7)

In [19]:
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras import metrics

In [20]:
# Using embedding from Keras
embedding_length = 100
model = Sequential()
model.add(Embedding(n_words, embedding_length, input_length=max_len))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(63, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.15))
model.add(Dense(180,activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(n_outputs,activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=[metrics.categorical_accuracy])

# Tests to make sure our data are shaped correctly

In [21]:
assert(X.shape[1] == max_len)
assert(y.shape[1] == n_outputs)
assert(X.shape[0] == y.shape[0])

In [22]:
model.fit(X, y.as_matrix(), epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1226b9630>

# Spot check Predictions

In [45]:
index = 3000

print(df.iloc[index]['data'])
print()
print("=== Pseudo Label ===")
print(df.iloc[index][y_cols])
print()
print("=== Prediction ===")
pred = zip(model.predict(X[index:index+1])[0], y_cols)
for item in pred:
    print("{} == {:0.4f}".format(item[1],item[0]))

we investigate, by molecular dynamics simulation, the generic features associated with the dynamic compaction of metallic nano-foams at very high strain rates. a universal feature of the dynamic compaction process is revealed as composed of two distinct regions: a growing crushed region and a leading fluid precursor. the crushed region has a density lower than the solid material and gradually grows thicker in time by {\it snowplowing}. the trapped fluid precursor is created by ablation and/or melting of the foam filaments and the subsequent confinement of the hot atoms in a region comparable to the filament length of the foam. quantitative characterization of nano-foam compaction dynamics is presented and the compacted form equation-of-state is discussed. we argue that high-energy foam crushing is not a shock phenomenon even though both share the snowplow feature.

=== Pseudo Label ===
ceramic           0
polymer           0
semiconductor     0
metal             1
organometallic    0
e

# Try on Abstracts outside Training Set

In [36]:
def process_input(text, max_len=max_len):
    from keras.preprocessing import sequence
    x = text.split(' ')
    x = dct.doc2idx(x)
    x = x[0:max_len]
    x = sequence.pad_sequences([x], maxlen=max_len)
    return x

In [37]:
test = """
A polymer melt is simulated at finite temperature by the Monte-Carlo method. We use a coarse-grained model for the polymer system, the bond-fluctuation model. Static properties of the melt can be obtained by generating configurations not with single-monomer-dynamics which moves individual monomers locally, but reptation-dynamics which allows collective motion of the chains. This algorithm can produce equilibrated configurations much faster. It is demonstrated that static properties do not differ from those obtained by single-monomer-dynamics. Values of the radius of gyration, the mean square bond length and similar quantities for different temperatures and densities are presented.
"""

In [38]:
print("=== Prediction ===")
X_p = process_input(test)
X_p = X_p.clip(0,30000)
pred = zip(model.predict(X_p)[0], y_cols)
for item in pred:
    print("{} == {:0.4f}".format(item[1],item[0]))


=== Prediction ===
ceramic == 0.0009
polymer == 0.9984
semiconductor == 0.0000
metal == 0.0000
organometallic == 0.0000
experiment == 0.0053
simulation == 0.9954
