In [37]:
############
#Data preparation
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from keras.preprocessing import sequence
import itertools
import numpy as np
from sklearn.model_selection import train_test_split

#define constant variables
vocabulary_size = 8000
#unknown_token = "UNKNOWN_TOKEN" //defined in reduce_words_num_in_tokenized_abstracts function
file_name='first1k.txt'

# NOTE: this cell calls prepare_abstracts_for_tokenization,
# reduce_words_num_in_tokenized_abstracts and split_by_keywords, which are
# defined in the "Functions" cell further down; that cell must be run first.

df = pd.read_json(path_or_buf=file_name, typ='frame', lines=True)

#delete studies without abstract or keywords
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of df (avoids SettingWithCopyWarning).
df_reduct = df.loc[df.abstract.notna() & df.keywords.notna(), ["abstract","keywords"]].copy()

df_reduct.reset_index(drop=True, inplace=True)

#lowercase, remove special characters and numbers (numbers might be needed), (optional: stemming, stopword removal)
df_reduct['abstract'] = prepare_abstracts_for_tokenization(df_reduct.abstract)

#Tokenize the abstracts
tokenized_abstracts = [nltk.word_tokenize(sent) for sent in df_reduct.abstract]

#Get the most frequent words and replace others with unknown_token
# Index 0 is reserved for padding (pad_sequences fills with 0), so at most
# vocabulary_size - 1 distinct tokens (including the unknown token) may remain;
# every word id then stays below vocabulary_size.
tokenized_abstracts = reduce_words_num_in_tokenized_abstracts(tokenized_abstracts, vocabulary_size - 1)
df_reduct['tokenized_abstracts'] = tokenized_abstracts

#DEPRECATED!!!
##split records by keywords: deprecated, one should handle several keywords as several labels
df_splitted=split_by_keywords(df_reduct,"keywords")

word_set=set().union(*df_reduct.tokenized_abstracts)
# sorted() makes the word -> id mapping reproducible between kernel runs
# (set iteration order is not); ids start at 1 because 0 is the padding value.
words_with_index = {w: i for i, w in enumerate(sorted(word_set), start=1)}

#Replace words with numbers
# Kept as a plain list of lists: the rows have different lengths, and
# np.asarray on ragged input fails on recent NumPy versions.
X_sequences = [[words_with_index[w] for w in sent] for sent in df_splitted.tokenized_abstracts]

#max_num of words in abstract
max_num = max(len(sent) for sent in X_sequences)

#pad input sequences
X_padded = sequence.pad_sequences(X_sequences, maxlen=max_num)

############
#Label preparation

#Get word set
y_vocab = df_splitted.keywords.drop_duplicates()

#Assign index to word
y_index_to_word = list(y_vocab)
#y_index_to_word.append(unknown_token)

y_word_to_index = {w: i for i, w in enumerate(y_index_to_word)}

y_all = np.asarray([y_word_to_index[w] for w in df_splitted.keywords])


############
#Split data set to train and test data sets
# The full arrays keep their own names (X_padded, y_all) so that re-running
# this cell always splits the complete data set, never an earlier split.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_all, test_size=0.4, random_state=0)

In [5]:
############
#Create model
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense
from keras.layers import LSTM

embedding_vecor_length = 32

# y_train / y_test hold integer class ids (one id per distinct keyword), so the
# network needs one output unit per class with a softmax, trained with
# sparse categorical cross-entropy. A single sigmoid unit with binary
# cross-entropy is only valid for 0/1 labels.
num_classes = int(max(y_train.max(), y_test.max())) + 1

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_vecor_length, input_length=max_num))
model.add(LSTM(100))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

In [25]:
############
#Evaluation of the model
# Score the trained network on the held-out split. The second entry of the
# result is read as the accuracy (presumably [loss, accuracy], given the
# metrics passed to compile() — verify if the metrics list changes).
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (100 * test_accuracy))

In [36]:
##################
##Functions

def prepare_abstracts_for_tokenization(abstracts):
    """Normalise raw abstract strings before tokenization.

    Each abstract is lower-cased, every character that is not an ASCII letter
    or a space is replaced by a space, and every run of spaces is collapsed
    into a single space.

    Parameters
    ----------
    abstracts : iterable of str
        The raw abstracts (e.g. a list or a pandas Series of strings).

    Returns
    -------
    list of str
        The cleaned abstracts, in the same order as the input.
    """
    # TODO: digits and possibly some special characters may need to be kept here.
    non_letters = re.compile('[^a-zA-Z ]')
    # ' +' collapses runs of any length; replacing only the two-space pair
    # once leaves doubled spaces behind wherever three or more were adjacent.
    space_runs = re.compile(' +')

    cleaned = []
    for text in abstracts:
        text = non_letters.sub(' ', text.lower())
        cleaned.append(space_runs.sub(' ', text))

    #stemming?

    #remove stopwords?

    return cleaned

def reduce_words_num_in_tokenized_abstracts(tokenized_abstracts,vocabulary_size):
    """Keep only the most frequent words and mask all others.

    The ``vocabulary_size - 1`` most frequent words across all abstracts are
    kept; every other word is replaced by the string ``'UNKNOWN_TOKEN'``, so
    at most ``vocabulary_size`` distinct tokens remain.

    Parameters
    ----------
    tokenized_abstracts : indexable sequence of list of str
        One token list per abstract. It is modified in place: each position
        ``i`` is reassigned via ``tokenized_abstracts[i] = ...``.
    vocabulary_size : int
        Maximum number of distinct tokens to keep, counting the unknown token.

    Returns
    -------
    The same ``tokenized_abstracts`` object that was passed in.
    """
    # Local import keeps this helper independent of the notebook's import cell.
    from collections import Counter

    unknown_token='UNKNOWN_TOKEN'

    #Count word frequency over all abstracts
    word_freq = Counter(itertools.chain.from_iterable(tokenized_abstracts))

    #Get the (vocabulary_size - 1) most common words; a set gives constant-time
    #membership tests in the replacement loop below
    kept_words = {word for word, _count in word_freq.most_common(vocabulary_size-1)}

    #Replace words missing from the kept vocabulary with the unknown token
    for i, sent in enumerate(tokenized_abstracts):
        tokenized_abstracts[i] = [w if w in kept_words else unknown_token for w in sent] #unknown_token to parameter?

    return tokenized_abstracts

#Deprecated
def split_by_keywords(data_frame,columnname):
    """Expand each row into one row per element of its ``columnname`` value.

    For every row of ``data_frame`` and every element found when iterating
    over the value stored in ``columnname``, one output row is produced: a
    copy of the original row with ``columnname`` set to that single element.
    Rows whose value is empty produce no output rows.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Input frame; it is not modified.
    columnname : str
        Name of the column holding an iterable (e.g. a list of keywords).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``data_frame`` and a fresh
        0..n-1 index.
    """
    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for _, row in data_frame.iterrows():
        # Iterate over the requested column rather than a hard-coded one.
        for key_word in row[columnname]:
            record = row.to_dict()
            record[columnname] = key_word
            records.append(record)
    return pd.DataFrame(records, columns=data_frame.columns)

In [4]:
#Word2vector example
import pandas as pd
import gensim
import nltk

file_name='first1k.txt'

#read file
df = pd.read_json(path_or_buf=file_name, typ='frame', lines=True)
df = df[["abstract","title","lang"]]

#keep only english studies that have both an abstract and a title
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded data.
df = df.loc[(df.lang=="en") & df.abstract.notna() & df.title.notna()].copy()

#reset indexes
df.reset_index(drop=True, inplace=True)

#lowercase, remove special characters
# regex=True is passed explicitly: newer pandas versions treat the pattern as
# a literal string by default, which would leave the text uncleaned.
# ' +' collapses space runs of any length, not just one pair.
df['abstract'] = (
    df['abstract']
    .str.lower()
    .str.replace('[^a-zA-Z ]', ' ', regex=True)
    .str.replace(' +', ' ', regex=True)
)

#tokenize
df['abstract'] = [nltk.word_tokenize(sent) for sent in df.abstract]

#apply vectors
# NOTE(review): gensim >= 4.0 renamed the `size` argument to `vector_size`;
# adjust this call when upgrading gensim.
model = gensim.models.Word2Vec(df.abstract, size=10, window=5)

print(model.wv['text'])
model.wv.most_similar("combination")

[-0.11415748  0.15742187  0.16901349  0.14381915  0.3512613   0.17911656
  0.06045374 -0.15670416  0.31205043  0.16865592]


[('region', 0.9974913001060486),
 ('day', 0.9972256422042847),
 ('attitudes', 0.9969133734703064),
 ('examination', 0.9969038963317871),
 ('showing', 0.9968945384025574),
 ('right', 0.9965606927871704),
 ('terms', 0.9964362978935242),
 ('lnm', 0.9964209198951721),
 ('regions', 0.9963256120681763),
 ('frequency', 0.9961439371109009)]

In [3]:
#word embedding, one integer per word plus padding example
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

file_name='first1k.txt'

# Load the line-delimited JSON records and keep the three columns used here
df = pd.read_json(path_or_buf=file_name, typ='frame', lines=True)
df = df[["abstract","title","lang"]]

# Keep English studies that have both an abstract and a title, then renumber
# the rows from 0
is_english = df.lang == "en"
has_abstract_and_title = df.abstract.notna() & df.title.notna()
df = df[is_english & has_abstract_and_title].reset_index(drop=True)


# Vocabulary size: should be checked against the real data and tuned accordingly
vocab_size = 8000

# Hash every word of every abstract to an integer; if vocab_size is smaller
# than the real number of distinct words, some words share the same integer
encoded_docs = [one_hot(abstract_text, vocab_size) for abstract_text in df["abstract"]]

# Number of tokens in the longest encoded abstract
max_length = max(map(len, encoded_docs))

# Left-pad shorter sequences with 0 (RNNs need equal lengths; other models might not)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='pre')

print(padded_docs)

[[   0    0    0 ... 4283 6361 2521]
 [   0    0    0 ... 6148 5863 2503]
 [   0    0    0 ... 6011 4379 1499]
 ...
 [   0    0    0 ... 2517 7177 4591]
 [   0    0    0 ... 5490 2200 7790]
 [   0    0    0 ... 3122 4060 5291]]
