In [1]:
import numpy as np
import pandas as pd

# Prompt

Data cleaning is definitely a "practice makes perfect" skill. Using this dataset of article open-access prices paid by the WELLCOME Trust between 2012 and 2013, determine the five most common journals and the total articles for each. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal . You will need to do considerable data cleaning in order to extract accurate estimates. For a real bonus round, identify the open access prices paid by subject area.

In [2]:
wellcome = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='latin1')
print(wellcome.dtypes)
wellcome.columns = ['id', 'publisher', 'journal', 'title', 'cost']
wellcome.head()

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object


Unnamed: 0,id,publisher,journal,title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [3]:
wellcome.describe()

Unnamed: 0,id,publisher,journal,title,cost
count,1928,2127,2126,2127,2127
unique,1880,299,984,2126,1402
top,In Process,Elsevier,PLoS One,"Exclusive breastfeeding, diarrhoel morbidity a...",£2040.00
freq,7,387,92,2,94


In [4]:
# Ensure every element in the DataFrame is a string
wellcome = wellcome.applymap(str)
wellcome2 = wellcome
# Remove all whitespace and maximize proportion of alphanumeric characters
wellcome = wellcome.applymap(lambda item: ''.join(item.split(' ')))
wellcome = wellcome.applymap(lambda item: ''.join(item.split('\n')))
wellcome = wellcome.applymap(lambda item: item.replace(':', ''))
wellcome = wellcome.applymap(lambda item: item.replace('(', ''))
wellcome = wellcome.applymap(lambda item: item.replace(')', ''))
wellcome = wellcome.applymap(lambda item: item.replace(',', ''))
wellcome = wellcome.applymap(lambda item: item.replace('-', ''))
wellcome = wellcome.applymap(lambda item: item.replace('\n', ''))
wellcome = wellcome.applymap(lambda item: item.replace('[', ''))
wellcome = wellcome.applymap(lambda item: item.replace(']', ''))
wellcome = wellcome.applymap(lambda item: item.replace('/', ''))
wellcome = wellcome.applymap(lambda item: item.replace('?', ''))

# Lowercase every entry
wellcome = wellcome.applymap(lambda item: item.lower())

# Filter out records which have a float64 type NaN as an ID or blank
wellcome = wellcome[wellcome.id.map(lambda item: item != 'nan')]
wellcome = wellcome[wellcome.id.map(lambda item: item != '')]

# Filter ID's which are only made of text
wellcome = wellcome[wellcome.id.map(lambda item: item.isalpha() == False)]

# Remove currency sign from cost variable
wellcome.cost = wellcome.cost.map(lambda item: item.replace('£', ''))
wellcome.cost = wellcome.cost.map(lambda item: item.replace('$', ''))
wellcome = wellcome[wellcome.cost.map(lambda item: item != '')]

# Convert cost variable to Pandas float type
wellcome.cost = wellcome.cost.astype(dtype='float64')

# Remove periods in journal variable
wellcome.journal = wellcome.journal.map(lambda item: item.replace('.', ''))

## Top 5 journals and article count

In [5]:
# Create new variable for journal groupings
journals = wellcome.groupby('journal', as_index=True)

# Display ranking
journals.title.count().nlargest(5)

journal
plosone                         194
journalofbiologicalchemistry     52
neuroimage                       27
nucleicacidsresearch             25
plospathogens                    24
Name: title, dtype: int64

## Journal cost mean, median, and standard deviation

### Mean

In [6]:
journals.mean().sort_values('cost', ascending=False)

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
poned1217947,999999.000000
experimentalcellresearch,999999.000000
expertreviewsinmolecularmedicine,999999.000000
frontiersincognition,999999.000000
geneticsinmedicine,999999.000000
journalofpaediatricurology,999999.000000
molecluar&cellularendocrinology,999999.000000
oxforduniversitypress,999999.000000
thejournalofcognitiveneuroscience,999999.000000
pmedicined1203130,999999.000000


### Median

In [7]:
journals.median().sort_values('cost', ascending=False)

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
molecluar&cellularendocrinology,999999.00
frontiersincognition,999999.00
experimentalcellresearch,999999.00
poned1217947,999999.00
pmedicined1203130,999999.00
journalofpaediatricurology,999999.00
oxforduniversitypress,999999.00
expertreviewsinmolecularmedicine,999999.00
geneticsinmedicine,999999.00
thejournalofcognitiveneuroscience,999999.00


### Standard deviation

In [8]:
journals.std(ddof=0)

Unnamed: 0_level_0,cost
journal,Unnamed: 1_level_1
academyofnutritionanddietetics,0.000000
acschemicalbiology,433.593733
acschemicalneuroscience,0.000000
acsnano,25.250000
actacrystallographicasectiond,0.000000
actacrystallographicasectiondbiologicalcrystallography,1.160000
actacrystallographicasectionfstructuralbiologyandcrystallizationcommunications,11.035000
actacrystallographyd,0.000000
actad,0.000000
actadermatovenereologica,0.000000


## For a real bonus round, identify the open access prices paid by subject area. (unfinished)

In [9]:
# Remove all whitespace and maximize proportion of alphanumeric characters
wellcome2 = wellcome2.applymap(lambda item: ''.join(item.split('\n')))
wellcome2 = wellcome2.applymap(lambda item: item.replace(':', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('(', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace(')', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace(',', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('-', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('\n', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('[', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace(']', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('/', ''))
wellcome2 = wellcome2.applymap(lambda item: item.replace('?', ''))

# Lowercase every entry
wellcome2 = wellcome2.applymap(lambda item: item.lower())

# Filter out records which have a float64 type NaN as an ID or blank
wellcome2 = wellcome2[wellcome2.id.map(lambda item: item != 'nan')]
wellcome2 = wellcome2[wellcome2.id.map(lambda item: item != '')]

# Filter ID's which are only made of text
wellcome2 = wellcome2[wellcome2.id.map(lambda item: item.isalpha() == False)]

# Remove currency sign from cost variable
wellcome2.cost = wellcome2.cost.map(lambda item: item.replace('£', ''))
wellcome2.cost = wellcome2.cost.map(lambda item: item.replace('$', ''))
wellcome2 = wellcome2[wellcome2.cost.map(lambda item: item != '')]

# Convert cost variable to Pandas float type
wellcome2.cost = wellcome2.cost.astype(dtype='float64')

# Remove periods in journal variable
wellcome2.journal = wellcome2.journal.map(lambda item: item.replace('.', ''))

# split elements apart at spaces
wellcome2.journal = wellcome2.journal.map(lambda item: item.split(' '))
wellcome2.title = wellcome2.title.map(lambda item: item.split(' '))