In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import keras
import random
import re
import dask.bag as db
import json
import pandas as pd
import re
import nltk
nltk.download('stopwords')
import jieba as jb

In [None]:
# Matches every character that is NOT an ASCII letter, a digit, whitespace or a
# hyphen. Written as a raw string so that ``\s`` is not treated as an (invalid)
# string escape, and compiled once instead of on every call.
_NON_ALNUM_RE = re.compile(r"[^0-9a-zA-Z\s-]")


def remove_punctuation(line):
    """Remove punctuation and other non-alphanumeric characters from ``line``.

    Parameters
    ----------
    line : object
        Value to clean; it is converted with ``str()`` first, so non-string
        inputs are accepted.

    Returns
    -------
    str
        ``line`` with everything except ASCII letters, digits, whitespace and
        hyphens removed, and with leading/trailing whitespace stripped.
        Whitespace-only input yields the empty string.
    """
    text = str(line)
    if not text.strip():
        return ''
    return _NON_ALNUM_RE.sub('', text).strip()

# select year for data set
def LSTM_2_select_year(year1, year2,
                       data_path='/content/datasets/Cornell-University/arxiv/arxiv-metadata-oai-snapshot.json',
                       min_samples=500):
    """Load the metadata of papers whose latest version lies strictly between two years.

    Parameters
    ----------
    year1, year2 : int
        Exclusive bounds. A paper is kept when ``year1 < year < year2``, where
        ``year`` is read from the ``created`` field of the paper's last version.
    data_path : str, optional
        Path of the line-delimited JSON metadata file read with ``dask.bag``.
    min_samples : int, optional
        Main categories with fewer than this many selected papers are dropped.

    Returns
    -------
    docs_df : pandas.DataFrame
        One row per paper with the columns ``id``, ``title``, ``category``
        (list of category strings), ``abstract``, ``time`` (year as a string),
        ``main_category`` (one-element list holding the prefix of the first
        category) and ``main_category2`` (that prefix as a plain string).
        The frame is empty, but still has these columns, when nothing matches.
    cat_list : numpy.ndarray
        The distinct ``main_category2`` values that remain in ``docs_df``.
    """

    def get_year(doc):
        # The year is the 4th space-separated token of the last version's
        # 'created' stamp (presumably of the form "Mon, 2 Apr 2007 19:18:42 GMT").
        return doc['versions'][-1]['created'].split(' ')[3]

    def trim(doc):
        # Keep only the fields that the downstream code needs.
        return {'id': doc['id'],
                'title': doc['title'],
                'category': doc['categories'].split(' '),
                'abstract': doc['abstract'],
                'time': get_year(doc)}

    # use dask.bag to load the json file, one JSON document per line
    docs = db.read_text(data_path).map(json.loads)

    # Both bounds are exclusive: passing (y - 1, y + 1) selects exactly year y.
    records = (docs.filter(lambda doc: year1 < int(get_year(doc)) < year2)
                   .map(trim)
                   .compute())

    # Naming the columns explicitly keeps the frame well-formed when no paper
    # matches; otherwise the column accesses below would fail on an empty frame.
    docs_df = pd.DataFrame(records, columns=['id', 'title', 'category', 'abstract', 'time'])

    # Prefix (text before the first '.') of the first listed category, kept as
    # a one-element list in 'main_category' and as a string in 'main_category2'.
    docs_df['main_category'] = docs_df['category'].apply(
        lambda cats: cats[0].split('.')[0].split())
    docs_df['main_category2'] = docs_df['main_category'].apply(lambda parts: parts[0])

    # Discard categories that have too few samples.
    counts = docs_df['main_category2'].value_counts()
    rare_categories = counts[counts < min_samples].index
    docs_df = docs_df[~docs_df['main_category2'].isin(rare_categories)]

    # find all remaining categories
    cat_list = docs_df['main_category2'].unique()

    return docs_df, cat_list

# select categories
def _build_lstm_classifier(vocab_size, embedding_dim, sequence_length, n_classes=2):
    """Build and compile the Embedding -> SpatialDropout1D -> LSTM -> softmax model."""
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=sequence_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def LSTM_2_select_categories(docs_df, cat_list, n_topics, num_cal, epochs = 5, batch_size = 64):
    """Train one binary LSTM classifier per category pair and report its test accuracy.

    The first ``num_cal`` entries of a fixed list of category pairs are used.
    For each pair, the abstracts of the papers whose ``main_category2`` is one
    of the two categories are cleaned, tokenised and split 90/10 into a
    training and a test set; a fresh model is trained on the former and
    evaluated on the latter.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Must provide the columns ``abstract`` and ``main_category2``.
    cat_list : sequence
        Unused; kept for interface compatibility.
    n_topics : int
        Unused; kept for interface compatibility.
    num_cal : int
        Number of category pairs to evaluate (at most the number of built-in
        pairs).
    epochs : int, optional
        Maximum number of training epochs per model.
    batch_size : int, optional
        Training batch size.

    Returns
    -------
    acc : numpy.ndarray
        Accuracy on the held-out test split for each pair; ``nan`` for a pair
        that was skipped because fewer than two of its classes were present.
    list_2 : list of tuple
        The full list of built-in category pairs.
    num : numpy.ndarray
        Number of papers selected for each pair.
    his : list
        The Keras training history dict of each pair (``None`` when skipped).

    Raises
    ------
    ValueError
        If ``num_cal`` exceeds the number of built-in category pairs.
    """
    list_2 = [('math', 'nlin'),
              ('cond-mat', 'hep-ph'),
              ('astro-ph', 'math-ph'),
              ('stat', 'q-bio'),
              ('cs', 'stat'),
              ('math', 'math-ph'),
              ('physics', 'hep-ex'),
              ('nlin', 'q-fin'),
              ('hep-ph', 'gr-qc'),
              ('nucl-th', 'q-bio')]

    # Fail before any training starts rather than with an IndexError mid-run.
    if num_cal > len(list_2):
        raise ValueError('num_cal=%d exceeds the %d available category pairs'
                         % (num_cal, len(list_2)))

    # Keep only the 50,000 most frequently used words
    MAX_NB_WORDS = 50000
    # Maximum length of each tokenised abstract
    MAX_SEQUENCE_LENGTH = 250
    # Dimension of the embedding layer
    EMBEDDING_DIM = 100

    # Loaded once for all pairs; a set makes the per-word lookup O(1).
    stopwords = set(nltk.corpus.stopwords.words("english"))

    acc = np.zeros(num_cal)
    num = np.zeros(num_cal)
    his = []

    for i in range(num_cal):
        print('pair',i)
        cat1, cat2 = list_2[i]

        # Drop the state left behind by the previous pair's model so memory
        # does not grow with every model built in this loop.
        keras.backend.clear_session()

        # Papers of either category. The copy avoids writing into a view of
        # docs_df (SettingWithCopyWarning) and leaves the caller's frame intact.
        in_pair = docs_df['main_category2'].isin([cat1, cat2])
        pair_df = docs_df.loc[in_pair, ['abstract', 'main_category2']].copy()

        # get the number of papers
        num[i] = len(pair_df)

        # A two-way softmax cannot be trained when a class is missing, e.g.
        # because the category was filtered out upstream for this year.
        if pair_df['main_category2'].nunique() < 2:
            print('skipping pair', (cat1, cat2), '- fewer than two classes present')
            acc[i] = np.nan
            his.append(None)
            continue

        # Remove punctuation, then stopwords. The comparison is done on the
        # lower-cased token because the Tokenizer below lower-cases the text,
        # so capitalised stopwords would otherwise survive.
        # NOTE(review): jieba is a Chinese word segmenter; it is kept here to
        # preserve the existing tokenisation of the abstracts.
        cleaned = pair_df['abstract'].apply(remove_punctuation)
        texts = cleaned.apply(
            lambda x: " ".join(w for w in jb.cut(x) if w.lower() not in stopwords)).values

        tokenizer = Tokenizer(num_words = MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(texts)
        X = tokenizer.texts_to_sequences(texts)

        # Pad / truncate every sequence to the same length
        X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

        # One-hot labels with a fixed column order: cat1 -> column 0, cat2 -> column 1
        labels = (pair_df['main_category2'] == cat2).astype(int).values
        Y = np.eye(2, dtype='float32')[labels]

        # Split training set and test set
        X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)

        model = _build_lstm_classifier(MAX_NB_WORDS, EMBEDDING_DIM, X.shape[1])

        history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,
                            callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
        his.append(history.history)

        # Compute test accuracy on the held-out split (evaluate returns
        # [loss, accuracy] for the metrics the model was compiled with).
        _, acc[i] = model.evaluate(X_test, Y_test, verbose=0)

    return acc, list_2, num, his

# 2 topics classification with LSTM
def LSTM_2_classification(start, end, n_topics, num_cal, epochs = 5, batch_size = 64):
    """Run the pairwise LSTM classification once for every year in ``[start, end]``.

    Each year's papers are loaded with ``LSTM_2_select_year`` and passed
    straight to ``LSTM_2_select_categories``, so only one year's DataFrame is
    held in memory at a time.

    Parameters
    ----------
    start, end : int
        First and last year (both inclusive).
    n_topics : int
        Forwarded to ``LSTM_2_select_categories``.
    num_cal : int
        Number of category pairs evaluated per year.
    epochs : int, optional
        Maximum number of training epochs, forwarded to the per-pair training.
    batch_size : int, optional
        Training batch size, forwarded to the per-pair training.

    Returns
    -------
    results_acc : numpy.ndarray, shape (n_years, num_cal)
        Accuracy for each year (row) and category pair (column).
    list_2 : list
        The category pairs as returned by ``LSTM_2_select_categories``
        (empty when the year range is empty).
    results_num : numpy.ndarray, shape (n_years, num_cal)
        Number of papers for each year and category pair.
    his_total : dict
        Maps ``'year<k>'`` (k = 0-based position of the year) to the list of
        training histories of that year.
    """
    list_year = np.arange(start, end+1)
    n = len(list_year)

    results_acc = np.zeros((n, num_cal))
    results_num = np.zeros((n, num_cal))

    # A plain dict rather than locals(): the result holds nothing but the
    # per-year histories and does not depend on how locals() behaves.
    his_total = {}
    cat_list = []
    # Defined up front so the return statement works for an empty year range.
    list_2 = []

    for k, year in enumerate(list_year):
        print(k)
        # The bounds of LSTM_2_select_year are exclusive, hence year -/+ 1.
        docs_df, _ = LSTM_2_select_year(year - 1, year + 1)
        acc, list_2, num, his = LSTM_2_select_categories(docs_df, cat_list, n_topics, num_cal, epochs = epochs, batch_size = batch_size)
        print('accuracy', acc)
        print('number of papers', num)
        results_acc[k,:] = acc
        results_num[k,:] = num
        his_total['year'+str(k)] = his

    return results_acc, list_2, results_num, his_total

In [None]:
# Run the experiment for the years 2017 through 2021 on the first 10 category
# pairs; arguments are spelled out by keyword so each number is self-describing.
results_acc, list_2, results_num, his_total = LSTM_2_classification(
    start=2017,
    end=2021,
    n_topics=2,
    num_cal=10,
    epochs=1,
    batch_size=64,
)