# Prediction relevance of article of future year 


### Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import functions    # my own functions which are used in more notebooks
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import tqdm
import math
import kds
import pickle
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
import gensim
print((gensim.__version__))  
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_curve, auc, roc_auc_score,classification_report, accuracy_score,precision_score,recall_score
from re import search
from sklearn.ensemble import RandomForestClassifier
from itertools import chain

4.2.0


## Upload cleaned abstracts

In [3]:
df_all = pd.read_csv("1.Preprocessing_outputs/df_sw_tok_low_punc_lemm_v7.csv").rename(columns = {'doi_x':'doi'}).set_index("doi")

In [4]:
len(df_all)

476175

In [5]:
df_all.head(2)

Unnamed: 0_level_0,Unnamed: 0,Year,Month,abstract,cord_uid,journal,license,authors,len,language,abstract_cleaned
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10.1186/1471-2334-1-6,0,2001.0,7.0,OBJECTIVE: This retrospective chart review des...,ug7v899j,BMC Infect Dis,no-cc,"Madani, Tariq A; Al-Ghamdi, Aisha A",1158,en,retrospective chart review describes epidemiol...
10.1186/rr14,1,2000.0,8.0,Inflammatory diseases of the respiratory tract...,02tnwd4m,Respir Res,no-cc,"Vliet, Albert van der; Eiserich, Jason P; Cros...",718,en,inflammatory disease respiratory tract commonl...


## Parameters

In [6]:
train_year_of_citations = 2021

# which articles based on year of publication will be selected for training and testing
min_train_year_published = 2019
max_train_year_published = 2020
min_pred_year_published = 2021
max_pred_year_published = 2022

embeddings_from_year = 2019
embeddings_to_year = 2022

classifier = "lr"  # or "rf"

# Add target flag for dataframe with dois

In [7]:
train_df = functions.add_target_opencitatins_marginal(target_year = train_year_of_citations,df = df_all,target_col_name="target")

train_df = train_df[(df_all['Year']<=max_train_year_published) & (df_all['Year']>=min_train_year_published)] 
print(len(train_df))

predict_df = df_all[(df_all['Year']<=max_pred_year_published) & (df_all['Year']>=min_pred_year_published)]
#predict_df = df_all
print(len(predict_df))

b'Skipping line 171761: expected 5 fields, saw 8\nSkipping line 208878: expected 5 fields, saw 8\n'


0    8576
1    4996
Name: target, dtype: int64
13572
9277
281752


In [10]:
### Random sample of 1000 articles 

predict_df.sample(1000).to_csv("random_samp_1000_articles.csv")

# BOW + CF

### Features - BOW Binary input matrix

In [8]:
cvec = CountVectorizer(analyzer = "word", tokenizer=lambda txt: txt.split(), 
                       ngram_range=(1,1),
                       binary= True,
                       min_df = 1) 

matrix_bow_train = cvec.fit_transform(train_df['abstract_cleaned'])
tokens_bow_train = cvec.get_feature_names()

with open('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/tokens_bow_'+str(train_year_of_citations)+'.data', 'wb') as filehandle:
    pickle.dump(tokens_bow_train, filehandle)
    
matrix_bow_train_pd = pd.DataFrame.sparse.from_spmatrix(matrix_bow_train, columns = tokens_bow_train)
matrix_bow_train_pd = matrix_bow_train_pd[sorted(matrix_bow_train_pd.columns)]
print(matrix_bow_train.shape)

(9277, 36269)


In [9]:
# for predicting we need to have the same features like in training, not new, not less ! 
matrix_bow_test = cvec.fit_transform(predict_df['abstract_cleaned'])
tokens_bow_test = cvec.get_feature_names()
matrix_bow_test_pd = pd.DataFrame.sparse.from_spmatrix(matrix_bow_test, columns = tokens_bow_test)
print(len(matrix_bow_test_pd))

not_in_test = np.setdiff1d(tokens_bow_train,tokens_bow_test)
columns_in_both = list(set(tokens_bow_train) & set(tokens_bow_test))
matrix_bow_test_pd_without_new_col = matrix_bow_test_pd[columns_in_both]

for col in tqdm(not_in_test): 
    matrix_bow_test_pd_without_new_col[col] =  np.nan
print(len(matrix_bow_test_pd_without_new_col.columns))

matrix_bow_test_pd = matrix_bow_test_pd_without_new_col.fillna(0)
matrix_bow_test_pd = matrix_bow_test_pd[sorted(matrix_bow_test_pd.columns)]

281752


100%|██████████| 48/48 [00:00<00:00, 344.10it/s]


36269


In [10]:
X_train_bow = matrix_bow_train_pd
y_train_bow = train_df.target

#  Classifier

In [11]:
if classifier == "lr":
    cf_bow_splitted_train = LogisticRegression(random_state=0, penalty = "l1",solver = "saga").fit(X_train_bow, y_train_bow)
    pickle.dump(cf_bow_splitted_train, open('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/lreg_bow_'+str(train_year_of_citations)+'.sav', 'wb'))

if classifier == "rf":
    cf_bow_splitted_train = RandomForestClassifier(random_state=0).fit(X_train_bow, y_train_bow)
    pickle.dump(cf_bow_splitted_train, open('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/rf_bow_'+str(train_year_of_citations)+'.sav', 'wb'))
    
y_pred_bow_test = cf_bow_splitted_train.predict_proba(matrix_bow_test_pd)[:,1]

### Articles with highest score

In [12]:
high_score_art_lr_bow = pd.DataFrame(list(zip(list(y_pred_bow_test), list(train_df.abstract.values),list(train_df.abstract_cleaned.values),list(predict_df.index),list(predict_df.Year.values) )),columns =["score","abstract","abstract_cleaned","doi","Year"])
high_score_art_lr_bow = high_score_art_lr_bow.sort_values("score",ascending=False)
high_score_art_lr_bow[["score","doi","Year"]][:5]

Unnamed: 0,score,doi,Year
789,0.997247,10.1186/s12943-021-01310-0,2021.0
82,0.994012,10.1038/s41467-020-20544-y,2021.0
3339,0.990473,10.1186/s12199-021-00995-5,2021.0
1350,0.989359,10.3324/haematol.2020.262485,2021.0
3080,0.989093,10.1111/psyp.13796,2021.0


In [13]:
len(high_score_art_lr_bow)

9277

### add info 

In [14]:
df_all = df_all.reset_index()

In [15]:
score_df = high_score_art_lr_bow.merge(df_all[["doi","authors","journal"]],on="doi",how="left")

### Add n most important words 

- vybírám maximum odds ratio - tedy jen slova která ZVYŠUJÍ P to be highly cited.

In [16]:
score_df["abstract_cleaned_tok"] = functions.tokenized_column(score_df["abstract_cleaned"])

feature_importance = functions.importance_lr_bow(tokens_bow_train,cf_bow_splitted_train,n=len(tokens_bow_train),odds_ratio = False)
#feature_importance["score_abs"] = abs(feature_importance["score"])
fi_dict = dict(zip(feature_importance.word,feature_importance.score))

n=10
row_list = []
for row in list(range(0,len(score_df.abstract_cleaned_tok))): 
    words_abstract = list(chain.from_iterable(list(score_df.abstract_cleaned_tok[row:row+1])))
    words_abstract_score = dict(((key, fi_dict[key]) for key in words_abstract))
    words_abstract_score_sort = dict(sorted(words_abstract_score.items(), key=lambda item: item[1],reverse=True)) 
    
    top_n = {k: words_abstract_score_sort[k] for k in list(words_abstract_score_sort)[:n]}
    
    #round values in dictionary
    res = dict()
    for key in top_n:  
        res[key] = round(top_n[key], 2)
    row_list.append(res)
    
score_df["top_n_words"] = row_list

In [17]:
score_df.to_csv("final_articles_score_table_bow.csv")

In [18]:
pd.set_option('display.max_colwidth', None)
score_df[["doi","score","Year","authors","journal","top_n_words"]][:10]

Unnamed: 0,doi,score,Year,authors,journal,top_n_words
0,10.1186/s12943-021-01310-0,0.997247,2021.0,"Huang, Xing; Zhang, Gang; Tang, Tianyu; Liang, Tingbo",Mol Cancer,"{'in-silico': 0.78, 'non-structural': 0.56, 'antiviral': 0.45, 'carried': 0.32, 'entire': 0.31, 'best': 0.3, 'towards': 0.27, 'spike': 0.26, 'receptor': 0.24, 'chinese': 0.23}"
1,10.1038/s41467-020-20544-y,0.994012,2021.0,"Mistry, Dina; Litvinova, Maria; Pastore y Piontti, Ana; Chinazzi, Matteo; Fumanelli, Laura; Gomes, Marcelo F. C.; Haque, Syed A.; Liu, Quan-Hui; Mu, Kunpeng; Xiong, Xinyue; Halloran, M. Elizabeth; Longini, Ira M.; Merler, Stefano; Ajelli, Marco; Vespignani, Alessandro",Nat Commun,"{'unfolded': 0.61, 'jnk': 0.31, 'component': 0.29, 'degradation': 0.25, 'support': 0.22, 'type': 0.22, 'virus': 0.21, 'translocation': 0.19, 'chop': 0.17, 'severe': 0.12}"
2,10.1186/s12199-021-00995-5,0.990473,2021.0,"Johnson, Natalie M.; Hoffmann, Aline Rodrigues; Behlen, Jonathan C.; Lau, Carmen; Pendleton, Drew; Harvey, Navada; Shore, Ross; Li, Yixin; Chen, Jingshu; Tian, Yanan; Zhang, Renyi",Environ Health Prev Med,"{'sequenced': 0.89, 'forecasting': 0.77, 'ten': 0.57, 'collection': 0.51, 'recombinant': 0.47, 'detecting': 0.25, 'thousand': 0.22, 'appear': 0.21, 'virus': 0.21, 'scenario': 0.16}"
3,10.3324/haematol.2020.262485,0.989359,2021.0,"Di Buduo, Christian A.; Aguilar, Alicia; Soprano, Paolo M.; Bocconi, Alberto; Miguel, Carolina P.; Mantica, Giovanna; Balduini, Alessandra",Haematologica,"{'allocation': 0.57, 'limitation': 0.48, 'short-term': 0.39, 'appropriately': 0.34, 'comorbidities': 0.28, 'outcome': 0.24, 'make': 0.22, 'score': 0.22, 'covid-19': 0.22, 'necessary': 0.21}"
4,10.1111/psyp.13796,0.989093,2021.0,"Zimmerman, Benjamin; Rypma, Bart; Gratton, Gabriele; Fabiani, Monica",Psychophysiology,"{'chemistry': 0.78, 'humidity': 0.49, 'across': 0.46, 'air': 0.39, 'sensor': 0.35, 'potentially': 0.26, 'covid-19': 0.22, 'movement': 0.21, 'interaction': 0.2, 'reflect': 0.13}"
5,10.3390/bios11080253,0.987647,2021.0,"Taha, Bakr Ahmed; Ali, Norazida; Sapiee, Nurfarhana Mohamad; Fadhel, Mahmoud Muhanad; Mat Yeh, Ros Maria; Bachok, Nur Nadia; Al Mashhadany, Yousif; Arsad, Norhana",Biosensors (Basel),"{'averaged': 0.57, 'greatest': 0.53, 'january': 0.47, 'pollution': 0.42, 'established': 0.4, 'quarantine': 0.39, 'air': 0.39, 'china': 0.33, 'particulate': 0.32, 'approximately': 0.32}"
6,10.1038/s42003-021-02706-w,0.985561,2021.0,"Natu, Vaidehi S.; Rosenke, Mona; Wu, Hua; Querdasi, Francesca R.; Kular, Holly; Lopez-Alvarez, Nancy; Grotheer, Mareike; Berman, Shai; Mezer, Aviv A.; Grill-Spector, Kalanit",Commun Biol,"{'recovered': 0.51, 'igm': 0.49, 'perinatal': 0.46, 'polymerase': 0.42, 'critically': 0.32, 'completed': 0.29, 'potentially': 0.26, 'spike': 0.26, 'antibody': 0.25, 'receptor': 0.24}"
7,10.1186/s13059-021-02363-6,0.985524,2021.0,"Ho, Won Jin; Erbe, Rossin; Danilova, Ludmila; Phyo, Zaw; Bigelow, Emma; Stein-O’Brien, Genevieve; Thomas, Dwayne L.; Charmsaz, Soren; Gross, Nicole; Woolman, Skylar; Cruz, Kayla; Munday, Rebecca M.; Zaidi, Neeha; Armstrong, Todd D.; Sztein, Marcelo B.; Yarchoan, Mark; Thompson, Elizabeth D.; Jaffee, Elizabeth M.; Fertig, Elana J.",Genome Biol,"{'checklist': 0.84, 'urgently': 0.75, 'guiding': 0.69, 'finally': 0.65, 'mostly': 0.52, 'systematic': 0.42, 'ovid': 0.36, 'diagnosing': 0.34, 'pubmed': 0.34, 'probably': 0.33}"
8,10.1371/journal.pone.0258333,0.985388,2021.0,"Velten, Julia; Scholten, Saskia; Brailovskaia, Julia; Margraf, Jürgen",PLoS One,"{'chloroquine': 0.58, 'antiviral': 0.45, 'phenotype': 0.45, 'mixture': 0.4, 'statistically': 0.39, 'ongoing': 0.36, 'best': 0.3, 'criterion': 0.29, 'prospective': 0.29, 'society': 0.28}"
9,10.1038/s41467-021-24885-0,0.984414,2021.0,"Salehinejad, Mohammad Ali; Wischnewski, Miles; Ghanavati, Elham; Mosayebi-Samani, Mohsen; Kuo, Min-Fang; Nitsche, Michael A.",Nat Commun,"{'kinetics': 0.43, 'dataset': 0.37, 'crp': 0.36, 'interpretation': 0.34, 'carried': 0.32, 'rather': 0.31, 'stage': 0.29, 'serial': 0.26, 'retrospective': 0.25, 'outcome': 0.24}"


# EMBEDDINGS

## Agregating w2v Embeddings + LR

- w2v was trained on all articles from kaggle (with or without target - openctitations) - to the target year of publication

In [19]:
model_w2v = gensim.models.Word2Vec.load("2.Train_embeddings_outpus/w2v_published_only_english_between_"+str(embeddings_from_year) + " and "+ str(embeddings_to_year)+".model")

In [20]:
train_df["abstract_tokenized"] = functions.tokenized_column(train_df.reset_index()["abstract_cleaned"])
df_X_train_avg = functions.transform_to_document_vector(text_col_tokenized = train_df.reset_index().abstract_tokenized,model = model_w2v,index_col_list = list(train_df.index),agg_func = "avg").fillna(0)

predict_df["abstract_tokenized"] = functions.tokenized_column(predict_df.reset_index()["abstract_cleaned"])
df_X_predict_avg = functions.transform_to_document_vector(text_col_tokenized = predict_df.reset_index().abstract_tokenized,model = model_w2v,index_col_list = list(predict_df.index),agg_func = "avg").fillna(0)

100%|██████████| 9277/9277 [00:36<00:00, 251.48it/s]
100%|██████████| 281752/281752 [21:42<00:00, 216.31it/s]


## LR

In [21]:
X_train_w2v_avg = df_X_train_avg.reset_index().rename({"index":"doi"},axis="columns")
X_test_w2v_avg = df_X_predict_avg.reset_index().rename({"index":"doi"},axis="columns")
y_train_w2v = y_train_bow

In [22]:
X_test_w2v_avg = X_test_w2v_avg.set_index("doi")

In [23]:
X_train_w2v_avg = X_train_w2v_avg.set_index("doi")

In [24]:
if classifier == "lr":
    cf_w2v_splitted_train_avg = LogisticRegression(random_state=0,penalty = "l1",solver = "saga").fit(X_train_w2v_avg, y_train_w2v)
    pickle.dump(cf_w2v_splitted_train_avg, open('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/'+'lreg_w2v_avg_'+str(train_year_of_citations)+'.sav', 'wb'))
    
if classifier == "rf":
    cf_w2v_splitted_train_avg = RandomForestClassifier(random_state=0).fit(X_train_w2v_avg, y_train_w2v)
    pickle.dump(cf_w2v_splitted_train_avg, open('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/'+'rf_w2v_avg_'+str(train_year_of_citations)+'.sav', 'wb'))
     
y_pred_w2v_predict_avg = cf_w2v_splitted_train_avg.predict_proba(df_X_predict_avg)[:,1]

### Abstracts with highest score

In [25]:
high_score_art_lr_w2v_avg = pd.DataFrame(list(zip(list(y_pred_w2v_predict_avg), list(predict_df.abstract.values),list(predict_df.abstract_cleaned),list(predict_df.index),list(predict_df.Year.values))),columns =["score","abstract","abstract_cleaned","doi","Year"])
high_score_art_lr_w2v_avg = high_score_art_lr_w2v_avg.sort_values("score",ascending=False)
high_score_art_lr_w2v_avg.to_csv('3.Classifiers_outputs/train_'+str(train_year_of_citations)+'/'+'high_score_art_lr_w2v_avg.csv')
high_score_art_lr_w2v_avg[["score","doi","Year"]][:10]

Unnamed: 0,score,doi,Year
105453,0.907235,10.31234/osf.io/38c7u,2021.0
187186,0.897899,10.1093/infdis/jiab355,2021.0
82162,0.871998,10.1142/s0219477521500528,2021.0
109290,0.859444,10.1093/jtm/taac025,2022.0
257888,0.845444,10.1093/cid/ciab909,2021.0
210697,0.843806,10.3390/nu14030693,2022.0
156453,0.842281,10.3390/jcm11051336,2022.0
258954,0.841745,10.1002/rmv.2270,2021.0
57445,0.839105,10.1101/2022.01.04.474803,2022.0
159722,0.836811,10.21203/rs.3.rs-1362445/v1,2022.0


In [26]:
score_df = high_score_art_lr_w2v_avg.merge(df_all[["doi","authors","journal"]],on="doi",how="left")

In [27]:
score_df

Unnamed: 0,score,abstract,abstract_cleaned,doi,Year,authors,journal
0,0.907235,Neuropsychiatric symptoms caused by COVID-19,neuropsychiatric symptom caused covid-19,10.31234/osf.io/38c7u,2021.0,"Inasaridze, Ketevan",
1,0.897899,"Whether monoclonal antibodies are able to neutralise SARS-CoV-2 variants of concern has been investigated using pseudoviruses. In this study we show that bamlanivimab, casirivimab, and imdevimab efficiently neutralise authentic SARS-CoV-2 including variant B.1.1.7 (Alpha) but variants B.1.351 (Beta) and P.2 (Zeta) were resistant against bamlanivimab and partially to casirivimab.",whether monoclonal antibody able neutralise sars-cov-2 variant concern investigated using pseudoviruses study show bamlanivimab casirivimab imdevimab efficiently neutralise authentic sars-cov-2 including variant b117 alpha variant b1351 beta p2 zeta resistant bamlanivimab partially casirivimab,10.1093/infdis/jiab355,2021.0,"Widera, Marek; Wilhelm, Alexander; Hoehl, Sebastian; Pallas, Christiane; Kohmer, Niko; Wolf, Timo; Rabenau, Holger F; Corman, Victor M; Drosten, Christian; Vehreschild, Maria J G T; Goetsch, Udo; Gottschalk, Rene; Ciesek, Sandra",J Infect Dis
2,0.871998,"In this study, we analyzed daily records of newly diagnosed cases in Wuhan, Hubei excluding Wuhan (HEW), and China excluding Hubei (CEH) to investigate the impact of the new coronavirus outbreak in Wuhan on cities around it and throughout China. We used multifractal detrended cross-correlation analysis (MF-DXA) method to investigate the correlations between the daily number of patients in Wuhan and HEW as well as in Wuhan and CEH. We concluded that the cross-correlations between the daily number of patients in Wuhan and HEW were higher than those between the daily number of patients in Wuhan and CEH because the multifractal features of Wuhan and HEW are greater than those of Wuhan and CEH. We also found that the “Wuhan closure” conducted on January 23 resulted in a decrease in cross-correlations between Wuhan and CEH.",study analyzed daily record newly diagnosed case wuhan hubei excluding wuhan china excluding hubei investigate impact new coronavirus outbreak wuhan city around throughout china used multifractal detrended cross-correlation analysis method investigate correlation daily number patient wuhan well wuhan concluded cross-correlations daily number patient wuhan higher daily number patient wuhan multifractal feature wuhan greater wuhan also found “wuhan conducted january resulted decrease cross-correlations wuhan,10.1142/s0219477521500528,2021.0,"Wang, Jian Shao Wei Yan Yan Kim Junseok",Fluctuation and Noise Letters
3,0.859444,"Given the heterogeneity in individual transmissibility, we estimated the superspreading potential of SARS-CoV-2 Delta variants. Using case series of Delta variants in Guangdong, China, we found 15% (95%CrI: 12, 19) of cases seeded 80% of offspring cases.",given heterogeneity individual transmissibility estimated superspreading potential sars-cov-2 delta variant using case series delta variant guangdong china found 95cri case seeded offspring case,10.1093/jtm/taac025,2022.0,"Zhao, Shi; Guo, Zihao; Chong, Marc Ka Chun; He, Daihai; Wang, Maggie H",J Travel Med
4,0.845444,"Beta (B.1.351) variant COVID-19 disease was investigated in Qatar. Compared to Alpha (B.1.1.7) variant, odds of progressing to severe disease were 1.24-fold (95% CI: 1.11-1.39) higher for Beta. Odds of progressing to critical disease were 1.49-fold (95% CI: 1.13-1.97) higher. Odds of COVID-19 death were 1.57-fold (95% CI: 1.03-2.43) higher.",beta b1351 variant covid-19 disease investigated qatar compared alpha b117 variant odds progressing severe disease ci higher beta odds progressing critical disease ci higher odds covid-19 death ci higher,10.1093/cid/ciab909,2021.0,"Abu-Raddad, Laith J; Chemaitelly, Hiam; Ayoub, Houssein H; Yassine, Hadi M; Benslimane, Fatiha M; Al Khatib, Hebah A; Tang, Patrick; Hasan, Mohammad R; Coyle, Peter; AlMukdad, Sawsan; Al Kanaani, Zaina; Al Kuwari, Einas; Jeremijenko, Andrew; Kaleeckal, Anvar Hassan; Latif, Ali Nizar; Shaik, Riyazuddin Mohammad; Abdul Rahim, Hanan F; Nasrallah, Gheyath K; Al Kuwari, Mohamed Ghaith; Butt, Adeel A; Al Romaihi, Hamad Eid; Al-Thani, Mohamed H; Al Khal, Abdullatif; Bertollini, Roberto",Clinical infectious diseases : an official publication of the Infectious Diseases Society of America
...,...,...,...,...,...,...,...
281747,0.013232,"How to cite this article: Singh AK, Kumar S, Aggarwal R, Trikha A. Check Central Venous Catheter Set thoroughly or Bite the Bullet! Indian J Crit Care Med 2021;25(7):832–833.",cite article singh ak kumar aggarwal r check central venous catheter set thoroughly bite bullet indian j crit care med,10.5005/jp-journals-10071-23891,2021.0,"Singh, Ashutosh K; Kumar, Sandeep; Aggarwal, Richa; Trikha, Anjan",Indian J Crit Care Med
281748,0.013159,"How to cite this article: Kumar N, Kumar A, Pradhan S, Kumar A, Singh K. Painful Blisters of Left Hand Following Extravasation of Remdesivir Infusion in COVID-19. Indian J Crit Care Med 2021;25(2):240–241.",cite article kumar n kumar pradhan kumar singh k painful blister left hand following extravasation remdesivir infusion covid-19 indian j crit care med,10.5005/jp-journals-10071-23732,2021.0,"Kumar, Neeraj; Kumar, Abhyuday; Pradhan, Swetalina; Kumar, Amarjeet; Singh, Kunal",Indian J Crit Care Med
281749,0.013087,Video 1Decompression of gastric intramural abscesses using LAMS.,video gastric intramural abscess using lam,10.1016/j.vgie.2021.05.009,2021.0,"Dolan, Russell D; McCarty, Thomas R; Papke, David James; Thompson, Christopher C",VideoGIE : an official video journal of the American Society for Gastrointestinal Endoscopy
281750,0.012949,"The pedicled anterolateral thigh flap, with or without the vastus lateralis muscle, has been described for pelvic exenteration defect reconstruction. However, its use as a free flap for this type of defect is not routinely followed. To reconstruct an extensive pelvic defect in the presence of two ostomies, we describe a free anterolateral thigh flap with deep inferior epigastric pedicles as recipient vessels.",pedicled anterolateral thigh flap without vastus lateralis muscle described pelvic exenteration defect reconstruction however use free flap type defect routinely followed reconstruct extensive pelvic defect presence two ostomy describe free anterolateral thigh flap deep inferior epigastric pedicle recipient vessel,10.1097/gox.0000000000003774,2021.0,"Prasidha, Ines; Moisidis, Elias; Hsieh, Frank",Plast Reconstr Surg Glob Open


In [28]:
score_df["len"] = score_df['abstract_cleaned'].str.len()

In [29]:
score_df = score_df.sort_values("score",ascending=False)

### Add n most important words 

In [30]:
n=10

In [31]:
score_df["abstract_cleaned_tok"] = functions.tokenized_column(score_df["abstract_cleaned"])

feature_importance = functions.score_of_word(model_w2v,cf_w2v_splitted_train_avg).sort_values(by = ["score"], ascending=False)
fi_dict = dict(zip(feature_importance.word,feature_importance.score))

row_list = []
for row in tqdm(list(range(0,len(score_df.abstract_cleaned_tok)))): 
    words_abstract = list(chain.from_iterable(list(score_df.abstract_cleaned_tok[row:row+1])))
    
    # potrebuji predfiltrovat jen ta slova z abstraktu pro ktera mam FI
    words_abstract = list(set(fi_dict.keys()).intersection(words_abstract))
    
    words_abstract_score = dict(((key, fi_dict[key]) for key in words_abstract))
    words_abstract_score_sort = dict(sorted(words_abstract_score.items(), key=lambda item: item[1],reverse=True)) 
    
    top_n = {k: words_abstract_score_sort[k] for k in list(words_abstract_score_sort)[:n]}
    
    #round values in dictionary
    res = dict()
    for key in top_n:  
        res[key] = round(top_n[key], 2)
    row_list.append(res)
    
score_df["top_n_words"] = row_list

100%|██████████| 281752/281752 [16:09<00:00, 290.49it/s]


In [32]:
pd.set_option('display.max_colwidth', None)
score_df[["doi","score","Year","authors","journal","top_n_words"]][:10]

Unnamed: 0,doi,score,Year,authors,journal,top_n_words
0,10.31234/osf.io/38c7u,0.907235,2021.0,"Inasaridze, Ketevan",,"{'neuropsychiatric': 0.99, 'symptom': 0.94, 'covid-19': 0.88, 'caused': 0.49}"
1,10.1093/infdis/jiab355,0.897899,2021.0,"Widera, Marek; Wilhelm, Alexander; Hoehl, Sebastian; Pallas, Christiane; Kohmer, Niko; Wolf, Timo; Rabenau, Holger F; Corman, Victor M; Drosten, Christian; Vehreschild, Maria J G T; Goetsch, Udo; Gottschalk, Rene; Ciesek, Sandra",J Infect Dis,"{'zeta': 1.0, 'pseudoviruses': 1.0, 'imdevimab': 1.0, 'casirivimab': 1.0, 'beta': 0.99, 'alpha': 0.99, 'bamlanivimab': 0.99, 'b117': 0.98, 'sars-cov-2': 0.98, 'b1351': 0.97}"
2,10.1142/s0219477521500528,0.871998,2021.0,"Wang, Jian Shao Wei Yan Yan Kim Junseok",Fluctuation and Noise Letters,"{'wuhan': 1.0, 'hubei': 1.0, 'multifractal': 1.0, '“wuhan': 1.0, 'cross-correlations': 0.99, 'detrended': 0.98, 'china': 0.98, 'city': 0.97, 'coronavirus': 0.96, 'cross-correlation': 0.93}"
3,10.1093/jtm/taac025,0.859444,2022.0,"Zhao, Shi; Guo, Zihao; Chong, Marc Ka Chun; He, Daihai; Wang, Maggie H",J Travel Med,"{'superspreading': 1.0, '95cri': 1.0, 'delta': 1.0, 'transmissibility': 0.99, 'sars-cov-2': 0.98, 'china': 0.98, 'variant': 0.95, 'guangdong': 0.95, 'case': 0.7, 'offspring': 0.64}"
4,10.1093/cid/ciab909,0.845444,2021.0,"Abu-Raddad, Laith J; Chemaitelly, Hiam; Ayoub, Houssein H; Yassine, Hadi M; Benslimane, Fatiha M; Al Khatib, Hebah A; Tang, Patrick; Hasan, Mohammad R; Coyle, Peter; AlMukdad, Sawsan; Al Kanaani, Zaina; Al Kuwari, Einas; Jeremijenko, Andrew; Kaleeckal, Anvar Hassan; Latif, Ali Nizar; Shaik, Riyazuddin Mohammad; Abdul Rahim, Hanan F; Nasrallah, Gheyath K; Al Kuwari, Mohamed Ghaith; Butt, Adeel A; Al Romaihi, Hamad Eid; Al-Thani, Mohamed H; Al Khal, Abdullatif; Bertollini, Roberto",Clinical infectious diseases : an official publication of the Infectious Diseases Society of America,"{'beta': 0.99, 'alpha': 0.99, 'b117': 0.98, 'b1351': 0.97, 'severe': 0.96, 'variant': 0.95, 'progressing': 0.9, 'covid-19': 0.88, 'death': 0.83, 'critical': 0.82}"
5,10.3390/nu14030693,0.843806,2022.0,"Zampelas, Antonis",Nutrients,"{'sars-cov-2': 0.98, 'coronavirus': 0.96, 'outbreak': 0.9, 'covid-19': 0.88, 'recent': 0.84, 'disease-2019': 0.82, '2019': 0.6, 'cause': 0.06}"
6,10.3390/jcm11051336,0.842281,2022.0,"Mallat, Jihad",J Clin Med,"{'sar-cov-2': 1.0, 'unparalleled': 0.99, 'scale': 0.98, 'coronavirus-2': 0.98, 'coronavirus': 0.96, 'severe': 0.96, 'covid-19': 0.88, 'world': 0.85, 'global': 0.73, 'pandemic': 0.71}"
7,10.1002/rmv.2270,0.841745,2021.0,"Chakraborty, Chiranjib; Bhattacharya, Manojit; Sharma, Ashish Ranjan",Rev Med Virol,"{'zeta': 1.0, 'b1526': 1.0, 'b16172': 1.0, 'b1617': 1.0, 'delta': 1.0, 'b1429': 1.0, 'b1427': 1.0, 'd614g': 1.0, 'voc': 1.0, 'beta': 0.99}"
8,10.1101/2022.01.04.474803,0.839105,2022.0,"Yuan, Tom Z.; Lucas, Carolina; Monteiro, Valter S.; Iwasaki, Akiko; Yang, Marisa L.; Nepita, Hector F.; Lujan Hernandez, Ana G.; Taft, Joseph M.; Frei, Lester; Reddy, Sai T.; Weber, Cédric R.; Malobisky, Kevin P.; Mesquita, Rodrigo; Sato, Aaron K.",bioRxiv,"{'omicron': 1.0, 'delta': 1.0, 'neutralizes': 0.99, 'sars-cov-2': 0.98, 'bispecific': 0.96, 'coronavirus': 0.96, 'severe': 0.96, 'brief': 0.96, 'variant': 0.95, 'receptor-binding': 0.94}"
9,10.21203/rs.3.rs-1362445/v1,0.836811,2022.0,"Chen, Jiahui; Wei, Guo-Wei",Res Sq,"{'omicron': 1.0, 'delta': 1.0, 'ba1': 1.0, 'ba2': 1.0, 'beta': 0.99, 'infectivity': 0.99, 'alpha': 0.99, 'sotrovimab': 0.99, 'lambda': 0.99, 'rapidly': 0.98}"


In [33]:
score_df.to_csv("final_articles_score_table_w2v.csv")

In [34]:
score_df = pd.read_csv("final_articles_score_table_w2v.csv")

In [35]:
corpus = pd.read_csv("sources/cord19/23_04_2022/metadata.csv")

In [36]:
score_df.merge(corpus[["doi","title"]],on="doi",how="left").to_csv("final_articles_score_table_w2v.csv")