In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_data = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding = 'ISO-8859-1')
raw_data.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


## Determine the five most common journals and the total articles for each

In [3]:
#put this into a dataframe and rename the columns to callable names w/ consistent format
df = raw_data.rename(columns={'PMID/PMCID':'pmid_pmcid',
                              'Publisher':'publisher',
                              'Journal title': 'journal_title',
                              'Article title':'article_title',
                              'COST (£) charged to Wellcome (inc VAT when charged)':'cost'})
df.head(10)

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,Journal of Medicinal Chemistry,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,Journal of Proteome Research,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,Mol Pharm,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,Chemical proteomic analysis reveals the drugab...,£1294.78


In [4]:
#lowercase and strip values for consistency
df['pmid_pmcid'] = df['pmid_pmcid'].str.lower()
df['pmid_pmcid'] = df['pmid_pmcid'].str.strip()
df['publisher'] = df['publisher'].str.lower()
df['publisher'] = df['publisher'].str.strip()
df['journal_title'] = df['journal_title'].str.lower()
df['journal_title'] = df['journal_title'].str.strip()
df['article_title'] = df['article_title'].str.lower()
df['article_title'] = df['article_title'].str.strip()
df['cost'] = df['cost'].str.lower()
df['cost'] = df['cost'].str.strip()

df.head()

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,cup,psychological medicine,reduced parahippocampal cortical thickness in ...,£0.00
1,pmc3679557,acs,biomacromolecules,structural characterization of a model gram-ne...,£2381.04
2,23043264 pmc3506128,acs,j med chem,"fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 pmc3646402,acs,j med chem,orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 pmc3601604,acs,j org chem,regioselective opening of myo-inositol orthoes...,£685.88


In [5]:
#find the 5 most common journals
top_five_journals = df.journal_title.value_counts().head(5)
top_five_journals

plos one                           190
journal of biological chemistry     53
neuroimage                          29
nucleic acids research              26
plos pathogens                      24
Name: journal_title, dtype: int64

In [6]:
#find the number of articles for each
plos_one = df.loc[lambda df: df['journal_title'] == 'plos one',:]
plos_one_articles = plos_one.groupby('journal_title')['article_title'].nunique()
print(plos_one_articles, '\n')

journal_of_biological_chemistry = df.loc[lambda df: df['journal_title'] == 'journal of biological chemistry',:]
journal_of_biological_chemistry_articles = journal_of_biological_chemistry.groupby('journal_title')['article_title'].nunique()
print(journal_of_biological_chemistry_articles, '\n')

neuroimage = df.loc[lambda df: df['journal_title'] == 'neuroimage',:]
neuroimage_articles = neuroimage.groupby('journal_title')['article_title'].nunique()
print(neuroimage_articles, '\n')

nucleic_acids_research = df.loc[lambda df: df['journal_title'] == 'nucleic acids research',:]
nucleic_acids_research_articles = nucleic_acids_research.groupby('journal_title')['article_title'].nunique()
print(nucleic_acids_research_articles, '\n')

plos_genetics = df.loc[lambda df: df['journal_title'] == 'plos genetics',:]
plos_genetics_articles = plos_genetics.groupby('journal_title')['article_title'].nunique()
print(plos_genetics_articles)

journal_title
plos one    189
Name: article_title, dtype: int64 

journal_title
journal of biological chemistry    53
Name: article_title, dtype: int64 

journal_title
neuroimage    29
Name: article_title, dtype: int64 

journal_title
nucleic acids research    26
Name: article_title, dtype: int64 

journal_title
plos genetics    24
Name: article_title, dtype: int64


## Calculate the mean, median, and std dev for the open access cost per article for each journal

In [7]:
plos_one['cost'] = plos_one['cost'].str.replace('£', '') #get rid of currency sign
plos_one['cost'] = plos_one['cost'].astype(float) #convert str to float
plos_one = plos_one[plos_one['cost'] <= 9999] #remove extreme outliers

#perform the same for the remaining 4 journals
journal_of_biological_chemistry['cost'] = journal_of_biological_chemistry['cost'].str.replace('£', '')
journal_of_biological_chemistry['cost'] = journal_of_biological_chemistry['cost'].astype(float)
journal_of_biological_chemistry = journal_of_biological_chemistry[journal_of_biological_chemistry['cost'] <= 9999]

neuroimage['cost'] = neuroimage['cost'].str.replace('£', '')
neuroimage['cost'] = neuroimage['cost'].astype(float)
neuroimage = neuroimage[neuroimage['cost'] <= 9999]

nucleic_acids_research['cost'] = nucleic_acids_research['cost'].str.replace('£', '')
nucleic_acids_research['cost'] = nucleic_acids_research['cost'].astype(float)
nucleic_acids_research = nucleic_acids_research[nucleic_acids_research['cost'] <= 9999]

plos_genetics['cost'] = plos_genetics['cost'].str.replace('£', '')
plos_genetics['cost'] = plos_genetics['cost'].astype(float)
plos_genetics = plos_genetics[plos_genetics['cost'] <= 9999]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

In [12]:
print('Plos One')
print('Mean is: ', plos_one['cost'].mean())
print('Mode is: ', plos_one['cost'].mode())
print('Std dev is: ', plos_one['cost'].std(), '\n')

print('Journal of Biological Chemistry')
print('Mean: ', journal_of_biological_chemistry['cost'].mean())
print('Mode: ', journal_of_biological_chemistry['cost'].mode())
print('Std dev: ', journal_of_biological_chemistry['cost'].std(), '\n')

print('Neuroimage')
print('Mean is: ', neuroimage['cost'].mean())
print('Mode is: ', neuroimage['cost'].mode())
print('Std dev is: ', neuroimage['cost'].std(), '\n')

print('Nucleic Acids Research')
print('Mean is: ', nucleic_acids_research['cost'].mean())
print('Mode is: ', nucleic_acids_research['cost'].mode())
print('Std dev is: ', nucleic_acids_research['cost'].std(), '\n')

print('Plos Genetics')
print('Mean is: ', plos_genetics['cost'].mean())
print('Mode is: ', plos_genetics['cost'].mode())
print('Std dev is: ', plos_genetics['cost'].std(), '\n')

Plos One
Mean is:  934.6558241758228
Mode is:  0    825.68
dtype: float64
Std dev is:  201.8475887334149 

Journal of Biological Chemistry
Mean:  1423.5884615384614
Mode:  0    1276.08
1    1556.61
dtype: float64
Std dev:  411.95435993820547 

Neuroimage
Mean is:  2215.168275862069
Mode is:  0    1762.69
1    2503.34
dtype: float64
Std dev is:  266.65394691928987 

Nucleic Acids Research
Mean is:  1149.0
Mode is:  0    852.0
dtype: float64
Std dev is:  442.9404474644419 

Plos Genetics
Mean is:  1643.1109090909092
Mode is:  0    1456.19
1    1807.60
dtype: float64
Std dev is:  153.36682533527224 

