In [217]:
import pandas as pd                     # data frame manipulations
import plotly.express as px             # plotting functions
import numpy as np                      # number manipulations
import re                               # text cleaning
import string                           # text cleaning
import contractions                     # text cleaning
from datetime import datetime           # timing functions
from nltk.corpus import stopwords       # stop words
import nltk                             # text processing
from nltk.stem import WordNetLemmatizer # Lemmatization
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import ne_chunk

# Reading and Cleaning

In [218]:
df = pd.read_csv("pubmed-cancer.csv")
df.shape
print(df.columns)

Index(['year', 'pmid', 'title', 'full_journal_title', 'free', 'journal',
       'cited_by', 'abstract', 'impact_factor', 'overall_rank'],
      dtype='object')


In [219]:
df.shape

(40435, 10)

In [220]:
df.columns

Index(['year', 'pmid', 'title', 'full_journal_title', 'free', 'journal',
       'cited_by', 'abstract', 'impact_factor', 'overall_rank'],
      dtype='object')

In [221]:
df.isnull().sum()

year                      1
pmid                      0
title                     1
full_journal_title        0
free                      0
journal                   1
cited_by              11382
abstract                243
impact_factor         30019
overall_rank          30019
dtype: int64

In [222]:
df = df.dropna(subset=['year', 'journal', 'abstract', 'title', 'cited_by'])
print(df.isnull().sum())
df.shape

year                      0
pmid                      0
title                     0
full_journal_title        0
free                      0
journal                   0
cited_by                  0
abstract                  0
impact_factor         22076
overall_rank          22076
dtype: int64


(28810, 10)

In [223]:
list(set(df.cited_by.to_list()))

['46',
 '17',
 '127',
 '24',
 '57',
 '98',
 '603',
 '280',
 '55',
 '299',
 '238',
 '125',
 '69',
 '47',
 '66',
 '35',
 '48',
 '133',
 '74',
 '190',
 '115',
 '67',
 '228',
 '19',
 '426',
 '85',
 '216',
 '332',
 '119',
 '90',
 '18,249',
 '136',
 '109',
 '9',
 '54',
 '388',
 '291',
 '167',
 '177',
 '176',
 '7',
 '78',
 '30',
 '27',
 '95',
 '163',
 '84',
 '102',
 '58',
 '37',
 '81',
 '<div class="results-amount">\n  \n    No results were found.\n  \n</div>',
 '94',
 '11',
 '32',
 '87',
 '184',
 '131',
 '146',
 '540',
 '255',
 '144',
 '23',
 '53',
 '158',
 '68',
 '49',
 '80',
 '50',
 '44',
 '63',
 '656',
 '181',
 '252',
 '173',
 '139',
 '342',
 '33',
 '314',
 '86',
 '20',
 '129',
 '310',
 '62',
 '31',
 '281',
 '261',
 '187',
 '79',
 '73',
 '117',
 '322',
 '298',
 '113',
 '6',
 '83',
 '52',
 '65',
 '174',
 '843',
 '2',
 '247',
 '88',
 '121',
 '36',
 '70',
 '207',
 '64',
 '107',
 '72',
 '29',
 '89',
 '234',
 '166',
 '4',
 '110',
 '56',
 '172',
 '14',
 '354',
 '43',
 '106',
 '59',
 '245',
 '21

In [224]:
# remove weird results for cited by
zero_mask = df[df['cited_by'] == '<div class="results-amount">\n  \n    No results were found.\n  \n</div>'].index
df.loc[zero_mask]['cited_by'] = 0

In [225]:
# Check for PMID duplicates
ids = df['pmid']

df[ids.isin(ids[ids.duplicated()])].sort_values("pmid")

Unnamed: 0,year,pmid,title,full_journal_title,free,journal,cited_by,abstract,impact_factor,overall_rank
38453,2019.0,29124779,Prenatal diethylstilbestrol exposure and cance...,Environmental Mutagenesis,False,environ mol mutagen,13,In the Diethylstilbestrol [DES] Combined Cohor...,3.04,7512.0
38452,2019.0,29124779,Prenatal diethylstilbestrol exposure and cance...,Environmental And Molecular Mutagenesis,False,environ mol mutagen,13,In the Diethylstilbestrol [DES] Combined Cohor...,3.06,5607.0
25845,2018.0,29234950,The evolution of endometrial carcinoma classif...,Virchows Archiv,False,virchows arch,3,Uterine cancer was first subclassified based o...,4.53,3124.0
25846,2018.0,29234950,The evolution of endometrial carcinoma classif...,Virchows Archiv-An International Journal Of Pa...,False,virchows arch,3,Uterine cancer was first subclassified based o...,,
12576,2018.0,29345977,Targeted and Nontargeted α-Particle Therapies,Annual Review Of Biomedical Engineering,True,annu rev biomed eng,22,α-Particle irradiation of cancerous tissue is ...,11.57,786.0
...,...,...,...,...,...,...,...,...,...,...
20364,2022.0,36350516,The Role of DNA Methylation and DNA Methyltran...,Identification Of The Cf Cystic Fibrosis Gene,False,adv exp med biol,4,The malignant transformation of normal cells i...,,
20363,2022.0,36350516,The Role of DNA Methylation and DNA Methyltran...,I Domain Integrins,False,adv exp med biol,4,The malignant transformation of normal cells i...,,
20362,2022.0,36350516,The Role of DNA Methylation and DNA Methyltran...,Hypoxia: Through The Lifecycle,False,adv exp med biol,4,The malignant transformation of normal cells i...,,
20372,2022.0,36350516,The Role of DNA Methylation and DNA Methyltran...,Immunobiology Of Proteins And Peptides V : Vac...,False,adv exp med biol,4,The malignant transformation of normal cells i...,,


In [226]:
dup_mask = [5159, 2867, 2901, 2879, 2866, 2869]
df = df.loc[~df.index.isin(dup_mask)]

In [227]:
df = df.drop_duplicates(subset='pmid', keep='first')
df.shape

(6401, 10)

In [228]:
df.isnull().sum()

year                    0
pmid                    0
title                   0
full_journal_title      0
free                    0
journal                 0
cited_by                0
abstract                0
impact_factor         108
overall_rank          108
dtype: int64

In [229]:
# Remove remaining nulls, not worth keeping
df = df.dropna(how='any', axis=0)
df.shape

(6293, 10)

In [230]:
# Change all cited by to integer
def is_number(val):
    try:
        val = int(val)
    except ValueError:
        val = 0
    return val

df.cited_by = df.cited_by.apply(lambda x: is_number(x))

In [231]:
df.cited_by = df.cited_by.astype(int)

# Initial Visualisation

In [232]:
dfc = df.sort_values('cited_by',ascending=False).head(25)

fig = px.bar(dfc, y='title', x='cited_by', title="25 Most cited papers from search results",
             labels={"title":"Title", "cited_by":"Number of citations"},
             width=1500, height=800)
fig.update_layout(yaxis={'categoryorder':'total ascending', 'dtick':1})
fig.show()

In [233]:
df['year'] = df['year'].astype(int)

dfy = df.year.value_counts()

fig = px.bar(dfy, x='year', title='Search results by year',
             labels={
                'index':'Year',
                'year':'Search results'
             })
fig.show()

In [234]:
dfg = df.journal.value_counts().nlargest(25)

fig = px.bar(dfg, x='journal', title="Top 25 journal results for 'cancer' search term",
             labels={
                'index':'Journal',
                'journal':'Search Results'
             },
             width=1500, height=800)
fig.update_layout(yaxis={'categoryorder':'total ascending', 'dtick':1})
fig.show()


In [235]:
df[df['journal'] == 'sci rep']

Unnamed: 0,year,pmid,title,full_journal_title,free,journal,cited_by,abstract,impact_factor,overall_rank
1451,2021,34916573,Traits of a mussel transmissible cancer are re...,Nan,True,sci rep,2,Some cancers have evolved the ability to sprea...,7.52,25.0
1504,2018,29666402,"Expression of KK-LC-1, a cancer/testis antigen...",Nan,True,sci rep,8,Kita-Kyushu lung cancer antigen-1 (KK-LC-1) is...,7.52,25.0
1522,2019,31147570,Doxycycline inhibits electric field-induced mi...,Nan,True,sci rep,6,"Adenocarcinoma, large cell carcinoma and squam...",7.52,25.0
3577,2022,35013485,Non-invasive scoring of cellular atypia in ker...,Nan,True,sci rep,4,Diagnosis based on histopathology for skin can...,7.52,25.0
3588,2018,29743726,Dramatic dysbalancing of the Wnt pathway in br...,Nan,True,sci rep,2,Wnt signaling is important for breast developm...,7.52,25.0
...,...,...,...,...,...,...,...,...,...,...
38655,2020,33051548,Cancer risks associated with the germline MITF...,Nan,True,sci rep,7,The MITF(E318K) variant confers moderate risk ...,7.52,25.0
39439,2020,32782317,The proteomic analysis of breast cell line exo...,Nan,True,sci rep,29,Cancer cells release small extracellular vesic...,7.52,25.0
39543,2020,32948783,Expression profiles of proton-sensing G-protei...,Nan,True,sci rep,6,The proton-sensing GPCRs (pH-GPCRs) GPR4 (GPR1...,7.52,25.0
39547,2020,32251318,The pan-cancer landscape of netrin family reve...,Nan,True,sci rep,7,Recent cancer studies have found that the netr...,7.52,25.0


In [236]:
dff = df.free.value_counts()

fig = px.bar(dff, y='free', title="Search results by open-access status",
             labels={
                'index':'Open-access status',
                'free':'Search results'
             })
fig.show()

In [237]:
# Get a visualisation of the distribution of the number of title words
df['title_words'] = df['title'].apply(lambda x: len(x.split(' ')))
display(df['title_words'].mean())

fig = px.histogram(df, x='title_words', title="Distribution of the number of title words", color='free',
                   labels={'count':'Count', 'title_words':'Number of words in title',
                   'free':'Open-Access'})
fig.update_layout(xaxis=dict(dtick=5))
fig.show()

12.501827427300174

In [238]:
df[df['title_words'] == 52]['title'].astype(str).values

array(["Cancer's second genome: Microbial cancer diagnostics and redefining clonal evolution as a multispecies process: Humans and their tumors are not aseptic, and the multispecies nature of cancer modulates clinical care and clonal evolution: Humans and their tumors are not aseptic, and the multispecies nature of cancer modulates clinical care and clonal evolution"],
      dtype=object)

In [239]:
# Get a visualisation of the distribution of the number of title words
df['abs_words'] = df['abstract'].apply(lambda x: len(x.split(' ')))
display(df['abs_words'].mean())

fig = px.histogram(df, x='abs_words', title="Distribution of the number of abstract words", color='free',
                   labels={'count':'Count', 'title_words':'Number of words in abstract',
                   'free':'Open-Access'})
fig.update_layout(xaxis=dict(dtick=25))
fig.show()

151.43000158906722

In [240]:
# Visualisation of the citation distribution
fig = px.histogram(df, x='cited_by', title='Citation distribution',
                   labels={'count':'Count', 'cited_by':'Number of citations',
                   'free':'Open-Access'},
                   color='free')
fig.show()

In [241]:
display(df['cited_by'].mean())
display(df['cited_by'].describe())
df['cited_by'].max()

15.269346893373589

count    6293.000000
mean       15.269347
std        36.642628
min         0.000000
25%         3.000000
50%         6.000000
75%        13.000000
max       933.000000
Name: cited_by, dtype: float64

933

In [242]:
bigcitationcount = df[df['cited_by'] > 100].count()[0]
citedaf = df[df['cited_by'] > 100]
print(f"{bigcitationcount} papers have been cited more than 100 times.")


bfg = citedaf.groupby('journal')['cited_by'].sum().reset_index().sort_values('cited_by', ascending=False).head(10)
bfg
fig = px.bar(bfg, x='cited_by', y='journal',
             title="Top 10 journals of papers with more than 100 citations",
             labels={'journal':'Journal', 'cited_by':'Sum of citations'})
fig.update_layout(yaxis={'categoryorder':'total ascending', 'dtick':1})
fig.show()

145 papers have been cited more than 100 times.


In [243]:
df.head()

Unnamed: 0,year,pmid,title,full_journal_title,free,journal,cited_by,abstract,impact_factor,overall_rank,title_words,abs_words
0,2019,31761807,What Is Cancer?,Perspectives In Biology And Medicine,False,perspect biol med,25,This essay focuses on themes in Explaining Can...,0.94,14278.0,3,109
1,2021,33820469,Cancer and stem cells,Nan,True,exp biol med,7,Being the second leading cause of death global...,7.52,25.0,4,132
2,2018,29860986,Immunotherapy and Prevention of Pancreatic Cancer,Nan,True,trends cancer,154,Pancreatic cancer is the third-leading cause o...,7.52,25.0,6,108
3,2020,32972405,Targeting STAT3 in Cancer Immunotherapy,Molecular Cancer,True,mol cancer,173,As a point of convergence for numerous oncogen...,35.68,147.0,5,143
4,2021,33296049,Unexpected guests in the tumor microenvironmen...,Protein & Cell,True,protein cell,44,Although intestinal microbiome have been estab...,5.89,313.0,9,135


# Text cleaning and pre-processing

Now that some basic dataframe statistics have been explored, and redundant entries with little useful information are removed, we can progress to cleaning and pre-processing text

In [244]:

# stop word removal


cleanhtml = re.compile('<.*?>')
cleanparenthesis = re.compile("[\(\[].*?[\)\]]")

def super_clean_text(text):
    """Cleans text by removing html tags, expanding contracted words, removing words from within parenthesis, removing punctuation and special characters and stripping whitespace"""
    # remove html tags
    text = re.sub(cleanhtml, '', text)

    # expand contractions
    expanded_words = []
    for word in text.split():
        expanded_words.append(contractions.fix(word))
    text = ' '.join(expanded_words)

    # Remove words from within parenthesi
    text = re.sub(cleanparenthesis, "", text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # remove special characters
    special_characters=['@','#','$','*','&']
    for x in special_characters:
        text = text.replace(x, '')

    
    # Strip any remaining space
    text = text.strip()


    return text


In [245]:
df_orig = df.copy()

In [246]:
df['title'] = df['title'].apply(lambda x: super_clean_text(x))
df['abstract'] = df['abstract'].apply(lambda x: super_clean_text(x))


In [247]:
def word_freq(x, terms=50, title=''):
    """Show the top terms of a given text"""
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':fdist.values()})

    # Get top words
    d = words_df.nlargest(columns='count', n = terms)

    # Plot it
    fig = px.bar(d, x='count', y='word', title=title,
                 width=1200, height=720,
                 color='count', color_continuous_scale=['darkred', 'red'])
    fig.update_layout(barmode='stack', showlegend=False, yaxis={"categoryorder":"total ascending", "dtick":1})
    fig.show()



def remove_stopwords(text, custom_stopwords = {}):
    """Remove stop words, including additional custom words"""
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(custom_stopwords)

    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)



In [248]:
df['title'] = df['title'].apply(lambda x: x.lower())
df['abstract'] = df['abstract'].apply(lambda x: x.lower())

In [249]:
word_freq(df['abstract'], title='Top words in all abstracts')

In [250]:
word_freq(df['title'], title="Top words in all titles")

In [251]:
# remove all stop words
df['title'] = df['title'].apply(lambda x: remove_stopwords(x))
df['abstract'] = df['abstract'].apply(lambda x: remove_stopwords(x))

In [252]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    words = ' '.join(words)
    return words



df['title'] = df['title'].apply(lambda x: lemmatize_text(x))
df['abstract'] = df['abstract'].apply(lambda x: lemmatize_text(x))




In [253]:
# Remove cancer from title
custom_stops = {'cancer'}
df['titlenc'] = df['title'].apply(lambda x: remove_stopwords(x, custom_stops))
df['abstractnc'] = df['abstract'].apply(lambda x: remove_stopwords(x, custom_stops))

In [254]:
display(word_freq(df['titlenc'], title="Top terms in all titles<br>(stop words removed, text lemmatized)"))
display(word_freq(df['abstractnc'], title="Top terms in all abstracts<br>(stop words removed, text lemmatized)"))

None

None

In [255]:
x = [text for text in df['abstract']]
x = ' '.join(x)
x = x.split()

total_abstract_words = len(x)
total_unique_abstract_words = len(list(set(x)))

print("The Abstract corpus contains {:,} total words. The Abstract corpus contains {:,} unique words.".format(total_abstract_words, total_unique_abstract_words))

The Abstract corpus contains 569,416 total words. The Abstract corpus contains 29,577 unique words.


# Sentiment

In [256]:
df.to_csv("df-cleaned.csv", index=False)

In [257]:
def get_sentiment(text, printsentiment=False):
    sia = SentimentIntensityAnalyzer()
    sentiment_dict = sia.polarity_scores(text)

    neg = sentiment_dict['neg']*100
    neu = sentiment_dict['neu']*100
    pos = sentiment_dict['pos']*100
    comp = sentiment_dict['compound']
    overall = ''

    if comp >= 0.05:
        overall='positive'
    elif comp <= -0.05:
        overall = 'negative'
    else:
        overall = 'neutral'

    if printsentiment:
        print(f"Text was rated as {neg}% negative.")
        print(f"Text was rated as {neu}% neutral.")
        print(f"Text was rated as {pos}% positive.")
        print(f"Overall score: {comp}.")
    
    return comp



In [259]:
df.columns

Index(['year', 'pmid', 'title', 'full_journal_title', 'free', 'journal',
       'cited_by', 'abstract', 'impact_factor', 'overall_rank', 'title_words',
       'abs_words', 'titlenc', 'abstractnc'],
      dtype='object')

In [261]:
df['sent_comp'] = df['abstract'].apply(lambda x: get_sentiment(x))
df['title_sent_comp'] = df['title'].apply(lambda x: get_sentiment(x))

In [193]:
dfneg = df.sort_values('sent_comp').head(25)
dfpos = df.sort_values('sent_comp', ascending=False).head(25)

In [268]:
free_comp = round(df[df['free'] == True]['sent_comp'].mean()*100,1)
paid_comp = round(df[df['free'] == False]['sent_comp'].mean()*100,1)
comp_diff = abs(free_comp - paid_comp)

print(f"Mean sentiment for Paid Articles: {paid_comp}\nMean sentiment for Open Access Articles: {free_comp}.\n Paid articles are {comp_diff}% more positive than open-access articles.")


Mean sentiment for Paid Articles: -68.4
Mean sentiment for Open Access Articles: -74.5.
 Paid articles are 6.099999999999994% more positive than open-access articles.


In [195]:
dftest = df[['free', 'sent_comp']]

from scipy.stats import ttest_ind

cat1 = dftest[dftest['free'] == True]
cat2 = dftest[dftest['free'] == False]


ttest_ind(cat1['sent_comp'], cat2['sent_comp'])



Ttest_indResult(statistic=-5.229096406573942, pvalue=1.7583802227017917e-07)

In [263]:
# Remove the word cancer
df['abstract_no_cancer'] = df['abstract'].apply(lambda x: remove_stopwords(x, custom_stopwords={'cancer'}))

In [269]:
df['sent_comp2'] = df['abstract_no_cancer'].apply(lambda x: get_sentiment(x))
df['title_comp2'] = df['titlenc'].apply(lambda x: get_sentiment(x))

In [198]:
df['sent_diff'] = df['sent_comp'] - df['sent_comp2']

In [270]:
mean_sent_abs = round(df['sent_comp2'].mean(),3)
mean_sent_tit = round(df['title_comp2'].mean(),3)

print(f"The mean sentiment score for all abstracts is: {mean_sent_abs}.\nThe mean sentiment score for all titles is: {mean_sent_tit}")


The mean sentiment score for all abstracts is: 0.225.
The mean sentiment score for all titles is: 0.006


In [272]:
cat1 = df[df['free'] == True]
cat2 = df[df['free'] == False]
ttest_ind(cat1['sent_comp2'], cat2['sent_comp2'])

Ttest_indResult(statistic=-3.4227658197165, pvalue=0.0006238261648783671)

In [274]:
ttest_ind(cat1['title_comp2'], cat2['title_comp2'])

Ttest_indResult(statistic=-0.9626966000026859, pvalue=0.33573680481481216)

In [286]:

fig = px.violin(df, y=['sent_comp2'], box=True, color='free', 
                title='Abstract composite sentiment score distribution',
                labels={
    'free':'Open-access',
    'variable':'Abstract composite sentiment score'
    })
fig.show()

In [287]:
fig = px.violin(df, y=['title_comp2'], color='free', box=True,
                title="Title composite sentiment score distribution",
                labels={
    'variable':'Title composite sentiment score',
    'free':'Open-access'
                })
fig.show()

In [300]:
free_mean = cat1['sent_comp2'].mean()
paid_mean = cat2['sent_comp2'].mean()
perc_more = (paid_mean - free_mean)/free_mean

print(f"Free article mean sent: {free_mean}.\nPaid article mean: {paid_mean}. \nPaid articles are {round(perc_more*100,1)}% more positive.")
print(f"Difference: {abs(free_mean - paid_mean)}")

Free article mean sent: 0.20133433439214554.
Paid article mean: 0.2542198586572438. 
Paid articles are 26.3% more positive.
Difference: 0.05288552426509824


In [295]:
free_mean_title = cat1['title_comp2'].mean()
paid_mean_title = cat2['title_comp2'].mean()
perc_more = (paid_mean_title - free_mean_title)/free_mean_title
perc_more = round((perc_more * 100),1)
print(f"Free article title mean sentiment: {free_mean_title}.")
print(f"Paid article title mean sentiment: {paid_mean_title}.")
print(f"Paid articles are {perc_more}% more positive.")

Free article title mean sentiment: 0.0032578111464048516.
Paid article title mean sentiment: 0.009369257950530036.
Paid articles are 187.6% more positive.


In [297]:
df['free_number'] = np.where(df['free'] == True, 1, 0)

In [298]:
df[['free', 'cited_by', 'sent_comp2', 'free_number']].corr()

Unnamed: 0,free,cited_by,sent_comp2,free_number
free,1.0,0.05491,-0.043114,1.0
cited_by,0.05491,1.0,-0.007958,0.05491
sent_comp2,-0.043114,-0.007958,1.0,-0.043114
free_number,1.0,0.05491,-0.043114,1.0


In [51]:
dfg = df.groupby(['journal', 'free'])['sent_comp2'].mean().reset_index().sort_values('sent_comp2', ascending=False)
dfg_top = dfg.head(10).copy()
dfg_bot = dfg.tail(10).copy()

dfgg = pd.concat([dfg_top, dfg_bot])
dfgg.reset_index(inplace=True)
dfgg = dfgg.sort_values('sent_comp2', ascending=False)
display(dfgg)

fig = px.bar(dfgg, x='sent_comp2', y='journal', color='free',
             title="Highest and lowest mean sentiment scores per journal",
             labels={"journal":"Journal", "free":"Open-access"})
fig.update_layout(yaxis={'categoryorder':'total ascending', 'dtick':1})
fig.show()

Unnamed: 0,index,journal,free,sent_comp2
0,994,j chemother,False,0.985
1,731,expert rev clin pharmacol,False,0.985
2,433,clin adv hematol oncol,True,0.9831
3,220,biomacromolecules,False,0.9831
4,945,j adolesc young adult oncol,True,0.9825
5,966,j biomater sci polym ed,False,0.9814
6,1172,j womens health,True,0.9805
7,9,acs biomater sci eng,False,0.9802
8,920,int j radiat biol,False,0.9783
9,714,exp gerontol,False,0.9764


In [299]:
df.head()
hm = df[['cited_by', 'impact_factor', 'overall_rank', 'title_words', 'sent_comp2', 'free_number']]
hm.columns = ['No. citations', 'Impact factor', 'Overall rank', 'Words in title', 'Sentiment composite', 'Open Access']
hm = hm.corr()

fig = px.imshow(hm, text_auto=True, color_continuous_scale='RdBu_r')

fig.show()

In [215]:
fig = px.histogram(df, x='sent_comp2', color='free', nbins=20, title="Distribution of sentiment scores",
                   labels={'sent_comp2':"Composite sentiment score",
                           'free':'Open-access'})
fig.show()

In [74]:
df.corr()

Unnamed: 0,year,pmid,free,cited_by,impact_factor,overall_rank,title_words,sent_comp,sent_comp2,sent_diff
year,1.0,0.948997,0.048577,-0.183138,-0.009765,-0.005995,-0.002365,0.022209,0.04084,-0.02419
pmid,0.948997,1.0,0.099422,-0.184084,-0.011292,-0.009783,-0.006472,0.020544,0.041437,-0.026038
free,0.048577,0.099422,1.0,0.052581,0.066017,-0.173335,0.100736,-0.055662,-0.04837,0.00661
cited_by,-0.183138,-0.184084,0.052581,1.0,0.082971,-0.098936,-0.129185,-0.043274,-0.021951,-0.010523
impact_factor,-0.009765,-0.011292,0.066017,0.082971,1.0,-0.212671,0.00744,-0.00159,-0.010926,0.009739
overall_rank,-0.005995,-0.009783,-0.173335,-0.098936,-0.212671,1.0,-0.033149,-0.020551,-0.021652,0.006236
title_words,-0.002365,-0.006472,0.100736,-0.129185,0.00744,-0.033149,1.0,0.033998,-0.059437,0.084995
sent_comp,0.022209,0.020544,-0.055662,-0.043274,-0.00159,-0.020551,0.033998,1.0,0.375916,0.374607
sent_comp2,0.04084,0.041437,-0.04837,-0.021951,-0.010926,-0.021652,-0.059437,0.375916,1.0,-0.718357
sent_diff,-0.02419,-0.026038,0.00661,-0.010523,0.009739,0.006236,0.084995,0.374607,-0.718357,1.0


In [79]:
df[df['overall_rank'].notnull()].count()

year                  6732
pmid                  6732
title                 6732
full_journal_title    6732
free                  6732
journal               6732
cited_by              6732
abstract              6732
impact_factor         6732
overall_rank          6732
title_words           6732
titlenc               6732
abstractnc            6732
text                  6732
text2                 6732
sent_comp             6732
abstract_no_cancer    6732
sent_comp2            6732
sent_diff             6732
dtype: int64

In [93]:
df = pd.read_csv("pubmed-cancer.csv")
not_null = len(df[df['overall_rank'].notnull()]['journal'].unique())
journals = len(df['journal'].unique())

print(f"Total journals: {journals} -- impact factor not null: {not_null}")

Total journals: 1857 -- impact factor not null: 1822


Unnamed: 0,year,pmid,title,full_journal_title,free,journal,cited_by,abstract,impact_factor,overall_rank,title_words,abs_words
0,2019,31761807,What Is Cancer?,Perspectives In Biology And Medicine,False,perspect biol med,25,This essay focuses on themes in Explaining Can...,0.94,14278.0,3,109
1,2021,33820469,Cancer and stem cells,Nan,True,exp biol med,7,Being the second leading cause of death global...,7.52,25.0,4,132
2,2018,29860986,Immunotherapy and Prevention of Pancreatic Cancer,Nan,True,trends cancer,154,Pancreatic cancer is the third-leading cause o...,7.52,25.0,6,108
3,2020,32972405,Targeting STAT3 in Cancer Immunotherapy,Molecular Cancer,True,mol cancer,173,As a point of convergence for numerous oncogen...,35.68,147.0,5,143
4,2021,33296049,Unexpected guests in the tumor microenvironmen...,Protein & Cell,True,protein cell,44,Although intestinal microbiome have been estab...,5.89,313.0,9,135
