In [14]:
import pandas as pd
import  tarfile
import os
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/avlasova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Locations of the two input archives. The loading cells prepend the value of
# the HOME environment variable to these, so they are relative to the home dir.
fragments_path = "/data/fragments.tar"
abstracts_path = "/data/abstracts.tar"

In [7]:
# Read the fragments table out of its tar archive.
# The archive is opened in a `with` block so that its file handle is released
# as soon as the table has been parsed, instead of staying open for the rest
# of the session.
index = 0  # position of the archive member to read; also used when loading the abstracts
with tarfile.open(os.getenv("HOME") + fragments_path) as tar:
    tar_members = tar.getmembers()
    fragments_file = tar.extractfile(tar_members[index])
    # Tab-separated with no header row: first column is the composite id,
    # second column is the fragment text.
    fragments_df = pd.read_csv(fragments_file, sep='\t', header=None,
                               names=['id', 'fragment_text'])

In [8]:
# Preview the freshly loaded fragments table (columns: id, fragment_text).
fragments_df

Unnamed: 0,id,fragment_text
0,PMC0014XXXXX/PMC1416283.xml/1,Blocking AMPA/kainate receptors attenuates whi...
1,PMC0014XXXXX/PMC1416283.xml/2,"Whole-cell clamped precursor, immature and mat..."
2,PMC0014XXXXX/PMC1416283.xml/3,"Precursor, immature and mature cerebellar whit..."
3,PMC0014XXXXX/PMC1416283.xml/4,NMDA (60μM) evoked an inward current in corpus...
4,PMC0014XXXXX/PMC1416283.xml/5,"Ifenprodil (10μM), which blocks NR2B-containin..."
...,...,...
350969,PMC0063XXXXX/PMC6342651.xml/25,"As shown before, treatment of muscle cells wit..."
350970,PMC0063XXXXX/PMC6342651.xml/26,The main physiological function of GLP-1 is to...
350971,PMC0063XXXXX/PMC6342651.xml/27,Expression of GLP-1R is present in peripheral ...
350972,PMC0063XXXXX/PMC6342651.xml/28,The present study examined the expression of G...


In [10]:
# Read the abstracts table out of its tar archive.
# The archive is opened in a `with` block so that its file handle is released
# as soon as the table has been parsed, instead of staying open for the rest
# of the session. `index` is the member position set in the fragments cell.
with tarfile.open(os.getenv("HOME") + abstracts_path) as tar:
    tar_members = tar.getmembers()
    abstracts_file = tar.extractfile(tar_members[index])
    # Tab-separated with no header row: composite id, then the abstract text.
    # NOTE: the column name 'abtract_text' (sic) is kept unchanged so that any
    # code referring to the column by name keeps working.
    abstracts_df = pd.read_csv(abstracts_file, sep='\t', header=None,
                               names=['id', 'abtract_text'])

In [11]:
# Preview the freshly loaded abstracts table (columns: id, abtract_text).
abstracts_df

Unnamed: 0,id,abtract_text
0,PMC0014XXXXX/PMC1416283.xml/1,Glutamate-mediated damage to oligodendrocytes ...
1,PMC0014XXXXX/PMC1459476.xml/1,We describe two homologues of the mammalian di...
2,PMC0014XXXXX/PMC1457108.xml/1,"This study describes staffing, caseloads and r..."
3,PMC0018XXXXX/PMC1872050.xml/1,Evolutionary ecology predicts that parasite li...
4,PMC0018XXXXX/PMC1828212.xml/1,It has been proposed that substance P and calc...
...,...,...
9217,PMC0063XXXXX/PMC6390403.xml/1,This paper proposes the creation and applicati...
9218,PMC0063XXXXX/PMC6317363.xml/1,The study’s objectives were to investigate the...
9219,PMC0063XXXXX/PMC6354590.xml/1,Tumor growth and relapse are driven by tumor p...
9220,PMC0063XXXXX/PMC6354590.xml/2,Hayes et al. find that Vangl2 specifically lab...


In [133]:
def divide_article_id(article):
    """Split a composite id string into its article id and trailing component.

    The id is split on '/'. The second component, cut at its first '.',
    becomes the article id; the third component is returned unchanged
    (as a string). For example ``'PMC0014XXXXX/PMC1416283.xml/1'`` gives
    ``['PMC1416283', '1']``.

    Returns a two-element ``pd.Series`` so that ``Series.apply`` expands the
    result into two columns. Raises ``IndexError`` when the id has fewer than
    three '/'-separated components.
    """
    parts = article.split('/')
    file_name = parts[1]
    trailing_id = parts[2]
    # Everything before the first dot, or the whole name if it has no dot.
    stem = file_name.partition('.')[0]
    return pd.Series([stem, trailing_id])

In [134]:
# Expand each composite id into separate article_id / abstract_id columns.
abstracts_df[['article_id', 'abstract_id']] = abstracts_df['id'].apply(divide_article_id)

In [137]:
# Expand each composite id into separate article_id / fragment_id columns.
fragments_df[['article_id', 'fragment_id']] = fragments_df['id'].apply(divide_article_id)

In [139]:
# Remove the composite 'id' column from both tables.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
fragments_df = fragments_df.drop(columns=['id'], errors='ignore')
abstracts_df = abstracts_df.drop(columns=['id'], errors='ignore')

In [140]:
# Preview the fragments table after the id split (fragment_text, article_id, fragment_id).
fragments_df

Unnamed: 0,fragment_text,article_id,fragment_id
0,"Phosphoinositides (PIs),[xref] phosphorylated ...",PMC1838524,1
1,The PX domain was first identified within a se...,PMC1838524,2
2,The atomic resolution crystal and solution str...,PMC1838524,3
3,The Vam7p t-SNARE (soluble N-ethylmaleimide-se...,PMC1838524,4
4,DNA fragments encoding residues 2-122 of the y...,PMC1838524,5
...,...,...,...
361680,RNA was isolated from cells and tissues as des...,PMC6377304,32
361681,Western blotting was performed as described[xr...,PMC6377304,33
361682,"The concentration of nitrite (NO2−), the oxidi...",PMC6377304,34
361683,IL-1β was assessed using an R&D DuoSet ELISA k...,PMC6377304,35


In [141]:
# Rebuild one text per article: collect each article's fragments and join
# them with newlines, then expose the result as an 'article_text' column.
texts_df = (
    fragments_df
    .groupby('article_id')['fragment_text']
    .apply('\n'.join)
    .reset_index()
    .rename(columns={'fragment_text': 'article_text'})
)

In [145]:
# Pair every article text with its abstract row(s) via an inner join on article_id.
# NOTE(review): an article with several abstract rows yields several rows here,
# while write_to_file names its output by article_id only, so later rows would
# overwrite earlier ones. Confirm this is intended.
papers_df = texts_df.merge(abstracts_df, on='article_id')

In [146]:
# Preview the merged table (article_id, article_text, abtract_text, abstract_id).
papers_df

Unnamed: 0,article_id,article_text,abtract_text,abstract_id
0,PMC1283128,There are two principal active end products of...,18-hydroxycortisol (18-OHF) and 18-oxocortisol...,1
1,PMC1397880,"On 4 February 1935, an agitated Honor Fell, th...","The technique of tissue culture has, throughou...",1
2,PMC1564435,Fatal self-harm is a global problem responsibl...,Although the high rate of elderly suicide is c...,1
3,PMC1602059,At least four transposable elements have been ...,Current techniques for the genetic engineering...,1
4,PMC1634803,Transient receptor potential (TRP) channels ar...,TRPM2 proteins belong to the melastatin-relate...,1
...,...,...,...,...
11242,PMC6441973,Prostate cancer is among the most common cance...,The therapeutic landscape of prostate cancer h...,1
11243,PMC6443446,"Mate, a traditional nonalcoholic drink, is an ...","Drinking mate, an infusion of the herb ilex pa...",1
11244,PMC6444363,Tat protein is neurotoxic in vitro. A variety ...,HIV-1 Tat is known to be neurotoxic and import...,1
11245,PMC6446232,"We are pleased to join the colleagues, student...",Methionine in proteins is often thought to be ...,1


In [150]:
def write_to_file(article):
    """Write one paper to ``$HOME/BertSum/raw_data/<article_id>.paper``.

    The file contains the article body with one sentence per paragraph
    (sentences separated by a blank line), followed by every abstract
    sentence, each preceded by an ``@highlight`` marker line.

    Parameters
    ----------
    article : sequence of four items
        ``(article_id, full_text, abstract, abstract_id)``, e.g. one row of
        ``papers_df``. ``abstract_id`` is unpacked but not used.

    Returns
    -------
    None
        The function is called for its side effect of writing the file. An
        existing file with the same name is overwritten.
    """
    article_id, full_text, abstract, _abstract_id = article

    body_sentences = nltk.tokenize.sent_tokenize(full_text)
    abstract_sentences = nltk.tokenize.sent_tokenize(abstract)

    sep = '@highlight\n\n'
    full_text = '\n\n'.join(body_sentences)
    # Every abstract sentence, including the first, gets its own marker.
    summary = sep + ('\n\n' + sep).join(abstract_sentences)
    res_article = '\n\n'.join([full_text, summary])

    out_dir = os.getenv("HOME") + '/BertSum/raw_data'
    # Create the target directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    out_path = '{}/{}.paper'.format(out_dir, article_id)

    # `with` guarantees the handle is closed even if the write fails; the
    # explicit UTF-8 encoding keeps non-ASCII characters in the text from
    # depending on the platform's default locale encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(res_article)

In [151]:
# Write one .paper file per row; the displayed result is a Series of None.
papers_df.apply(write_to_file, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
11242    None
11243    None
11244    None
11245    None
11246    None
Length: 11247, dtype: object