In [1]:
import numpy as np
import pandas as pd

# Prompt

Data cleaning is definitely a "practice makes perfect" skill. Using this dataset of article open-access prices paid by the WELLCOME Trust between 2012 and 2013, determine the five most common journals and the total articles for each. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal . You will need to do considerable data cleaning in order to extract accurate estimates. For a real bonus round, identify the open access prices paid by subject area.

In [2]:
wellcome = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='latin1')
print(wellcome.dtypes)
wellcome.columns = ['id', 'publisher', 'journal', 'title', 'cost']
wellcome.head()

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object


Unnamed: 0,id,publisher,journal,title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [3]:
wellcome.describe()

Unnamed: 0,id,publisher,journal,title,cost
count,1928,2127,2126,2127,2127
unique,1880,299,984,2126,1402
top,Not yet available,Elsevier,PLoS One,"Exclusive breastfeeding, diarrhoel morbidity a...",£2040.00
freq,7,387,92,2,94


In [4]:
# Ensure every element in the DataFrame is a string
wellcome = wellcome.applymap(str)

# Maximize proportion of alphanumeric characters
wellcome = wellcome.applymap(lambda item: ''.join(item.split('\n')))
wellcome = wellcome.applymap(lambda item: item.replace(':', ''))
wellcome = wellcome.applymap(lambda item: item.replace('(', ''))
wellcome = wellcome.applymap(lambda item: item.replace(')', ''))
wellcome = wellcome.applymap(lambda item: item.replace(',', ''))
wellcome = wellcome.applymap(lambda item: item.replace('-', ''))
wellcome = wellcome.applymap(lambda item: item.replace('\n', ''))
wellcome = wellcome.applymap(lambda item: item.replace('[', ''))
wellcome = wellcome.applymap(lambda item: item.replace(']', ''))
wellcome = wellcome.applymap(lambda item: item.replace('/', ''))
wellcome = wellcome.applymap(lambda item: item.replace('?', ''))

# Lowercase every entry
wellcome = wellcome.applymap(lambda item: item.lower())

# Filter out records which have a float64 type NaN as an ID or blank
wellcome = wellcome[wellcome.id.map(lambda item: item != 'nan')]
wellcome = wellcome[wellcome.id.map(lambda item: item != '')]

# Filter ID's which are only made of text
wellcome = wellcome[wellcome.id.map(lambda item: item.isalpha() == False)]

# Remove currency sign from cost variable
wellcome.cost = wellcome.cost.map(lambda item: item.replace('£', ''))
wellcome.cost = wellcome.cost.map(lambda item: item.replace('$', ''))
wellcome = wellcome[wellcome.cost.map(lambda item: item != '')]

# Convert cost variable to Pandas float type
wellcome.cost = wellcome.cost.astype(dtype='float64')

# Remove periods in journal variable
wellcome.journal = wellcome.journal.map(lambda item: item.replace('.', ''))

## Top 5 journals and article count

In [5]:
# Create new variable for journal groupings
journals = wellcome.groupby('journal', as_index=True)

# Display ranking
journals.title.count().nlargest(5)

journal
plos one                           187
journal of biological chemistry     52
neuroimage                          27
plos pathogens                      24
nucleic acids research              23
Name: title, dtype: int64

## Journal cost mean, median, and standard deviation

### Mean

In [6]:
journals.mean().sort_values('cost', ascending=False)

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
journal of paediatric urology,999999.000000
the journal of cognitive neuroscience,999999.000000
experimental cell research,999999.000000
oxford university press,999999.000000
qualitative research,999999.000000
poned1217947,999999.000000
molecluar & cellular endocrinology,999999.000000
pmedicined1203130,999999.000000
expert reviews in molecular medicine,999999.000000
genetics in medicine,999999.000000


### Median

In [7]:
journals.median().sort_values('cost', ascending=False)

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
pmedicined1203130,999999.000
expert reviews in molecular medicine,999999.000
frontiers in cognition,999999.000
the journal of cognitive neuroscience,999999.000
experimental cell research,999999.000
journal of paediatric urology,999999.000
qualitative research,999999.000
molecluar & cellular endocrinology,999999.000
genetics in medicine,999999.000
oxford university press,999999.000


### Standard deviation

In [10]:
journals.std().dropna()

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
acs chemical biology,500.670917
acs nano,35.708892
acta crystallographica section f structural biology and crystallization communications,15.605847
acta neuropathologica,68.759063
addiction,306.481292
age,237.785868
aids,281.067979
aids care,126.415851
aids uk,0.000000
alimentrary pharmacology & therapeutics,0.000000


## For a real bonus round, identify the open access prices paid by subject area.

In [9]:
# Defining grouping function based on EDA.
# The journal keywords were explored on articles costing >= mean + std
def subject_area(element):
    if 'pathogen' in element or 'disease' in element or 'disorder' in element:
        return 'disease'
    elif 'gene' in element or 'geno' in element:
        return 'genetics'
    elif 'bio' in element or 'cell' in element or 'mole' in element:
        return 'biology'
    elif 'chem' in element:
        return 'chemistry'
    elif 'cog' in element or 'neuro' in element:
        return 'neuroscience'
    elif 'plos' in element or 'qualitative' in element or 'oxford' in element or 'human' in element or 'embo' in element or 'ology' in element:
        return 'general science'
    else:
        return 'other'

# Calculating cost per journal subject area
journals.sum().groupby(subject_area, as_index=True).sum()

Unnamed: 0,cost
biology,10766401.42
chemistry,1088349.04
disease,2398302.21
general science,14826061.29
genetics,4236081.11
neuroscience,2301768.25
other,8185695.47
