In [1]:
#importing dependencies
import pandas as pd
import statistics as stat

In [2]:
# Load the raw CSV into a dataframe; every later cell reads from `df`.
file = 'cleaning_validation_utf8.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [3]:
#exploring dataframe: first few rows
print(df.head())

#counting non-null entries in each column
print(df.count())

#column dtypes and non-null counts (a column with fewer non-null entries than
#rows contains missing values)
#df.info() writes its report to stdout itself and returns None, so it must not
#be wrapped in print() -- doing so appends a stray "None" line to the output
df.info()

              PMID/PMCID Publisher           Journal title  \
0                    NaN       CUP  Psychological Medicine   
1             PMC3679557       ACS       Biomacromolecules   
2  23043264  PMC3506128        ACS              J Med Chem   
3    23438330 PMC3646402       ACS              J Med Chem   
4   23438216 PMC3601604        ACS              J Org Chem   

                                       Article title  \
0  Reduced parahippocampal cortical thickness in ...   
1  Structural characterization of a Model Gram-ne...   
2  Fumaroylamino-4,5-epoxymorphinans and related ...   
3  Orvinols with mixed kappa/mu opioid receptor a...   
4  Regioselective opening of myo-inositol orthoes...   

  COST (£) charged to Wellcome (inc VAT when charged)  
0                                              £0.00   
1                                           £2381.04   
2                                            £642.56   
3                                            £669.64   
4         

In [73]:
# Lowercase every journal title so that case-only variants collapse into one
# entry, then collect the distinct titles.
j_titles_lower = df['Journal title'].str.lower()
j_titles_unique = j_titles_lower.unique()

# len() counts every row of the column, not just the non-null ones.
print(f'journal titles: {len(j_titles_lower)}')
print(f'unique journal titles: {len(j_titles_unique)}')


journal titles: 2127
unique journal titles: 929


In [5]:
#pd.DataFrame(j_titles_unique).head(50)

In [13]:
#make dataframe with lowercase journal names, article titles, and numeric prices
jour_art_pr = pd.DataFrame()
jour_art_pr['Journal'] = df['Journal title'].str.lower()
jour_art_pr['Article'] = df['Article title']

cost = df['COST (£) charged to Wellcome (inc VAT when charged)']

# Strip everything except digits and the decimal point (currency symbols,
# thousands separators, stray whitespace). Keeping the "." means the value is
# parsed as written; the previous digit-only filter assumed the last two digits
# were always pence, so "£1000" became 10.00 and "£642.5" became 64.25.
price_filtered = cost.astype(str).str.replace(r'[^0-9.]', '', regex=True)

# errors='coerce' turns anything unparseable (blank / missing cells, malformed
# numbers) into NaN instead of raising, so one bad row cannot abort the cell.
# NaN prices are ignored by .quantile() and fail the `<` comparison used for
# outlier removal, so they drop out of the trimmed data downstream.
jour_art_pr['Pounds sterling'] = pd.to_numeric(price_filtered, errors='coerce').astype(float)


In [33]:
# Remove outliers: keep only the rows whose price is strictly below the
# 97th-percentile price (the most expensive tail is dropped).
prices = jour_art_pr['Pounds sterling']
cutoff = prices.quantile(q=.97)

without_outliers = jour_art_pr[prices < cutoff]

# row counts before and after trimming
print(len(jour_art_pr))
print(len(without_outliers))

2127
2063


In [56]:
# Hand-entered sample of prices used to sanity-check the statistics functions.
# NOTE(review): presumably the 'acs chemical biology' prices -- confirm.
values = [1294.59, 1294.78, 1267.76, 2286.73, 947.07]

# mean, median and sample standard deviation, one per line
for summarize in (stat.mean, stat.median, stat.stdev):
    print(summarize(values))

1418.186
1294.59
507.30956006170436


In [58]:
# Median price of the outlier-trimmed articles from one specific journal.
is_acs_chem_bio = without_outliers['Journal'] == 'acs chemical biology'
med_data = without_outliers.loc[is_acs_chem_bio, 'Pounds sterling']
med = stat.median(med_data)

print(med)

1294.59


In [63]:
# Parallel per-journal accumulators: position i in every list describes the
# same journal, in the order the titles appear in j_titles_unique.
journal_name, article_count = [], []
price_mean, price_median, price_stdev = [], [], []

for title in j_titles_unique:
    # prices of this journal's articles that survived outlier removal
    j_prices = without_outliers.loc[without_outliers['Journal'] == title, 'Pounds sterling']
    n_articles = len(j_prices)

    journal_name.append(title)
    article_count.append(n_articles)

    # mean and median need at least one price; record None when there is none
    price_mean.append(stat.mean(j_prices) if n_articles != 0 else None)
    price_median.append(stat.median(j_prices) if n_articles != 0 else None)

    # the sample standard deviation needs at least two prices
    price_stdev.append(stat.stdev(j_prices) if n_articles >= 2 else None)


In [78]:
# Summary table of price statistics, one row per unique journal title.
# Entries recorded as None in the price lists show up as NaN.
journal_summary = pd.DataFrame({
    'Journal title': j_titles_unique,
    'Mean Price': price_mean,
    'Median Price': price_median,
    'Price Stdev': price_stdev,
})

print(journal_summary.head())

                    Journal title   Mean Price  Median Price  Price Stdev
0          psychological medicine  1580.400000       2034.00   889.261941
1               biomacromolecules  2381.040000       2381.04          NaN
2                      j med chem   656.100000        656.10    19.148452
3                      j org chem   685.880000        685.88          NaN
4  journal of medicinal chemistry  1222.363333       1146.96   644.081173


In [83]:
# Rank journals by how many (outlier-trimmed) articles they contain,
# most articles first.
top_journals = pd.DataFrame(
    {'Journal': journal_name, 'article_count': article_count}
).sort_values('article_count', ascending=False)

# the five journals with the most articles
top_5_journals = top_journals.iloc[:5]

top_5_journals

Unnamed: 0,Journal,article_count
622,plos one,182
11,journal of biological chemistry,52
212,neuroimage,29
572,nucleic acids research,23
620,plos genetics,22
