In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import gensim
import operator
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_punctuation
print((gensim.__version__))  # needed 3.8.3   -> pip install gensim==3.8.3
from sklearn.linear_model import LogisticRegression
import functions
import re
from itertools import chain
from collections import Counter
import tqdm
from tqdm import tqdm

4.1.2


## Upload sources - trained models, data

In [3]:
train_year_of_citations = 2021

# which articles based on year of publication will be selected for training and testing
min_train_year_published = 2019
max_train_year_published = 2020
min_pred_year_published = 2022
max_pred_year_published = 2022

embeddings_from_year = 2019
embeddings_to_year = 2020

classifier = "lr"  # or "rf"

In [4]:
df_all = pd.read_csv("outputs/df_sw_tok_low_punc_lemm_v7.csv").rename(columns = {'doi_x':'doi'})

In [5]:
df_all_from = df_all[(df_all['Year']<=2022) & (df_all['Year']>=2021)]

# Upload models

In [6]:
lreg_w2v_avg = pickle.load(open('outputs/classifier/train_'+str(train_year_of_citations)+'/'+'lreg_w2v_avg_'+str(train_year_of_citations)+'.sav', 'rb'))
model_w2v = gensim.models.Word2Vec.load("outputs/w2v/w2v_published_between_"+str(embeddings_from_year) + " and "+ str(embeddings_to_year)+".model")

# Importance of lreg w2v

In [7]:
#### words of word2vec model whole dictionary - based on published articles 2019-2022 
words = model_w2v.wv.key_to_index.keys()
we_dict = {word:model_w2v.wv[word] for word in words}
words_list_total = pd.DataFrame(we_dict.items())
print(len(words_list_total))

82402


In [8]:
score_of_word =  functions.score_of_word(model_w2v,lreg_w2v_avg).sort_values(by = ["score"], ascending=False)

### Add cnt of articles and first year

In [9]:
def add_cnt_info(score_of_word, df_all, top_n = 40000):

    final_results_of_select = score_of_word[score_of_word["word"].isin(list(words_list_total[0].values))]
    df_all["abstract_cleaned_tok"] = functions.tokenized_column(df_all["abstract_cleaned"])
    df_all_list_of_lists = list(df_all["abstract_cleaned_tok"].values)

    corpus = df_all.abstract_cleaned
    words = ' '.join(corpus)
    output = Counter(words.split()).most_common()
    cnt_in_all_articles = pd.DataFrame(output,columns=["index","cnt_in_all_articles"])
    
    cnt = dict(Counter(chain.from_iterable(set(l) for l in df_all_list_of_lists)))
    cnt_articles = pd.DataFrame(cnt,index=["cnt_of_articles"]).transpose().reset_index()
    
    final_results_of_select = pd.merge(final_results_of_select, cnt_in_all_articles, left_on=  ['word'],
                   right_on= ['index'], 
                   how = 'left')
    
    final_results_of_select = pd.merge(final_results_of_select, cnt_articles, left_on=  ['word'],
                   right_on= ['index'], 
                   how = 'left')
    
    final_results_of_select = final_results_of_select[["word","score","cnt_in_all_articles","cnt_of_articles"]]
    
    top_df = final_results_of_select[:top_n]
 
    return top_df

In [10]:
top_df = add_cnt_info(score_of_word, df_all, top_n = 5000)

### Add first year

In [11]:
def score_info_wo_target_w2(top_df, df_all, top_n = 40000):
    
    from sklearn.feature_extraction.text import CountVectorizer
    cvec = CountVectorizer(analyzer = "word", tokenizer=lambda txt: txt.split(), 
                       ngram_range=(1,1),
                       binary= True,
                       min_df = 3
                      ) 
    matrix_bow_train = cvec.fit_transform(df_all['abstract_cleaned'])
    tokens_bow_train = cvec.get_feature_names_out()
    matrix_bow_train_pd = pd.DataFrame.sparse.from_spmatrix(matrix_bow_train, columns = tokens_bow_train,index=df_all.Year)
    matrix_bow_train_pd = matrix_bow_train_pd[[col for col in matrix_bow_train_pd.columns if col in list(top_df.word.values)]]
    matrix_bow_train_pd = matrix_bow_train_pd.reset_index()
    
    for col in tqdm(matrix_bow_train_pd.columns[1:(int(top_n))]):
        matrix_bow_train_pd[col] = matrix_bow_train_pd[col]*matrix_bow_train_pd['Year']
        #matrix_bow_train_pd[col] = np.where(matrix_bow_train_pd[col]==1,matrix_bow_train_pd['Year'],0)
        
    matrix_bow_train_pd=matrix_bow_train_pd.mask(matrix_bow_train_pd==0)
    matrix_bow_train_pd=matrix_bow_train_pd.fillna(10000)
    min_df = matrix_bow_train_pd.min()

    return pd.merge(top_df,min_df.reset_index(), left_on=['word'], right_on= ['index'],  how = 'left')

In [12]:
score_of_word_with_info = score_info_wo_target_w2(top_df, df_all, top_n = len(top_df))

  matrix_bow_train_pd = matrix_bow_train_pd.reset_index()
100%|██████████████████████████████████████████████████████████████████████████████| 4999/4999 [02:01<00:00, 41.10it/s]


In [13]:
score_of_word_with_info["quantile"]=pd.cut(score_of_word_with_info.score, bins=10, right=True,labels=["1","2","3","4","5","6","7","8","9","10"])
score_of_word_with_info = score_of_word_with_info.dropna()

## Add relevant articles - articles with the highest number of words appeared in

- from all articles 2019-2021

In [14]:
top_df = score_of_word_with_info

top_n = len(top_df)

from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(analyzer = "word", tokenizer=lambda txt: txt.split(), 
                       ngram_range=(1,1),
                       binary= False,
                       min_df = 1
                      ) 
matrix_bow_train = cvec.fit_transform(df_all_from['abstract_cleaned'])
tokens_bow_train = cvec.get_feature_names_out()
matrix_bow_train_pd = pd.DataFrame.sparse.from_spmatrix(matrix_bow_train, columns = tokens_bow_train,index=df_all_from.doi)
matrix_bow_train_pd = matrix_bow_train_pd[[col for col in matrix_bow_train_pd.columns if col in list(top_df.word.values)]]

In [15]:
matrix_bow_train_pd = matrix_bow_train_pd.reset_index()

  matrix_bow_train_pd = matrix_bow_train_pd.reset_index()


In [16]:
top_dois = []
top_cnt = []
word_list = []
for col in tqdm(matrix_bow_train_pd.columns[1:(int(top_n))]):
    top = matrix_bow_train_pd[[col]+["doi"]].sort_values(col,ascending=False)[:3]
    top_dois.append(str(top["doi"].values))
    top_cnt.append(str(list(top[col].values)))
    word_list.append(col)

100%|██████████████████████████████████████████████████████████████████████████████| 4993/4993 [04:28<00:00, 18.63it/s]


In [17]:
fin = pd.DataFrame(zip(top_dois,top_cnt,word_list),columns = ["doi","cnt_of_words","word"])

In [18]:
score_of_word_with_info = score_of_word_with_info.rename({0:"first_year"},axis=1).sort_values("score",ascending=False)

In [19]:
df_fin = score_of_word_with_info.merge(fin,on="word",how="left")
df_fin

Unnamed: 0,word,score,cnt_in_all_articles,cnt_of_articles,index,first_year,quantile,doi,cnt_of_words
0,mdd,0.999959,1001,284,mdd,1990.0,10,['10.1007/s40273-021-01019-4' '10.1016/j.jad.2...,"[16, 12, 12]"
1,ocd,0.999880,887,204,ocd,2006.0,10,['10.2196/26715' '10.3389/fpsyt.2021.677567' '...,"[15, 14, 12]"
2,delirium,0.999785,2667,787,delirium,2001.0,10,['10.1186/s12912-021-00543-0' '10.1186/s13063-...,"[18, 17, 17]"
3,rhinitis,0.999598,870,482,rhinitis,1977.0,10,['10.2196/33941' '10.3390/jcm10143183' '10.117...,"[8, 8, 7]"
4,ibs,0.999479,475,113,ibs,2003.0,10,['10.53350/pjmhs211572062' '10.1111/jgh.15466'...,"[14, 14, 14]"
...,...,...,...,...,...,...,...,...,...
4995,cobalamin,0.684859,67,26,cobalamin,1996.0,1,['10.3389/fmed.2021.807017' '10.3390/nu1306191...,"[3, 2, 2]"
4996,sgm,0.684791,201,46,sgm,2013.0,1,['10.1002/jia2.25728' '10.2147/dddt.s288829' '...,"[14, 10, 9]"
4997,casp,0.684779,96,82,casp,2016.0,1,['10.3390/bioengineering9030118' '10.1002/prot...,"[3, 3, 3]"
4998,methodswe,0.684670,129,129,methodswe,2005.0,1,['10.1101/2020.12.30.20248929' '10.1101/2021.0...,"[1, 1, 1]"


In [20]:
df_fin.to_csv("outputs/classifier/train_2021/score_of_word_with_info.csv")