In [2]:
import json

import pandas as pd
import torch
from simpletransformers.t5 import T5Model
from sklearn.model_selection import train_test_split
from tqdm import tqdm

Считываем данные

In [3]:
def get_metadata(path="../input/arxiv/arxiv-metadata-oai-snapshot.json"):
    """Lazily yield the raw lines of a line-delimited JSON snapshot.

    Args:
        path: Location of the snapshot file. Defaults to the path this
            notebook has always read from, so existing calls are unaffected.

    Yields:
        str: One line of the file at a time (trailing newline included),
        left undecoded so the caller decides when to pay for ``json.loads``.
    """
    # Pin the encoding: without it the platform default is used, which can
    # mis-decode or fail on non-ASCII text outside UTF-8 locales.
    with open(path, encoding="utf-8") as f:
        yield from f

In [4]:
# Peek at the first record through a throwaway generator. Pulling it from
# `metadata` itself would advance that generator, and the collection loop
# that iterates `metadata` afterwards would silently skip the first paper.
peek = get_metadata()
first_paper = json.loads(next(peek))
peek.close()  # release the file handle held by the suspended generator

# Fresh, unconsumed iterator for the cells that follow.
metadata = get_metadata()

# Show every field of the first record to get a feel for the schema.
for key, value in first_paper.items():
    print(key, ':', value)

In [5]:
# Per-field accumulators for papers that have a journal reference.
author = []
title = []
categories = []
abstract = []

n_journal_publicated = 0

# Iterate a fresh generator instead of the shared `metadata` one: a generator
# is single-use, so relying on it would drop whatever an earlier cell already
# consumed and would produce empty lists if this cell were run a second time.
for line in tqdm(get_metadata()):
    record = json.loads(line)
    # JSON null is decoded to None; identity comparison is the idiomatic test.
    if record['journal-ref'] is not None:
        n_journal_publicated += 1
        # NOTE(review): the `author` list is filled from the 'submitter' field;
        # confirm this is intended if the records also carry an authors field.
        author.append(record['submitter'])
        title.append(record['title'])
        categories.append(record['categories'])
        abstract.append(record['abstract'])

print(f'paper publicated on journals is: {n_journal_publicated}')

In [6]:
# Assemble the collected per-paper lists into a single table, one row per paper.
df = pd.DataFrame(
    dict(
        author=author,
        title=title,
        categories=categories,
        abstract=abstract,
    )
)
df.head()

In [7]:
# (n_rows, n_columns) of the table of papers that have a journal reference.
df.shape

The Simple Transformers implementation of the T5 model expects the data to be a DataFrame with three columns: `prefix`, `input_text`, and `target_text`.

prefix: A string indicating the task to perform

input_text: The input text sequence

target_text: The target sequence

In [8]:
# Every row gets the same task prefix.
summarize = ['summarize' for _ in range(len(df))]

# Abstract is the model input, title is the target to generate.
df_t5 = pd.DataFrame(
    dict(
        prefix=summarize,
        input_text=abstract,
        target_text=title,
    )
)

df_t5.head()

In [14]:
# Keep only the first 10000 rows for the experiments below.
df_t5 = df_t5.head(10000)

Делим на выборки, обучаем модель

In [15]:
# Hold out 30% of the rows for evaluation. The seed is fixed so the split is
# identical on every run; otherwise the fixed positional indices inspected
# later in the notebook would point at different papers each time.
train, test = train_test_split(df_t5, test_size=0.3, random_state=42)

In [10]:
# Settings passed to the Simple Transformers T5 wrapper.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 512,
    "train_batch_size": 4,
    "num_train_epochs": 4,
}

# Use the GPU when one is present and fall back to CPU otherwise; a hard-coded
# use_cuda=True fails on machines where CUDA is unavailable.
model = T5Model(
    "t5",
    "t5-small",
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

In [16]:
# Train on the training split; settings were supplied via model_args when the model was built.
model.train_model(train)

In [17]:
# Evaluate on the held-out split and keep the result for inspection in the next cell.
res = model.eval_model(test)

In [18]:
# Display whatever eval_model returned for the held-out split.
res

Посмотрим, как модель предсказывает названия:

In [26]:
# Positional index (into the held-out split) of the example to inspect.
random_num = 300
sample = test.iloc[random_num]  # look the row up once

actual_title = sample['target_text']
# The model input is the task prefix followed by the abstract
# (presumably mirroring how prefix and input_text are joined at training time — confirm).
actual_abstract = ["summarize: " + sample['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print()
print(f'Predicted Title: {predicted_title[0]}')
print()
# Print the raw abstract: the model input also carries the "summarize: " task
# prefix, which is not part of the paper's abstract.
print(f"Actual Abstract: {sample['input_text']}")

In [25]:
# Positional index (into the held-out split) of the example to inspect.
random_num = 980
sample = test.iloc[random_num]  # look the row up once

actual_title = sample['target_text']
# The model input is the task prefix followed by the abstract
# (presumably mirroring how prefix and input_text are joined at training time — confirm).
actual_abstract = ["summarize: " + sample['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print()
print(f'Predicted Title: {predicted_title[0]}')
print()
# Print the raw abstract: the model input also carries the "summarize: " task
# prefix, which is not part of the paper's abstract.
print(f"Actual Abstract: {sample['input_text']}")