### Cleaning Dirty Data

In [197]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [198]:
# The default decoder could not convert some bytes in this file (presumably the
# '£' signs in the cost column), so the 'unicode_escape' codec is used instead;
# it decodes the bytes Latin-1 style and therefore never fails on them.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in the
# data; an explicit single-byte codec such as 'latin-1' may be the safer choice.
# Confirm against the raw file before changing it.
apc_data = pd.read_csv('apcspend.csv', encoding='unicode_escape')

In [199]:
# Preview the first rows of the raw data to see the column layout and value formats.
apc_data.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [200]:
# Preview the last rows as well; the end of the file shows further identifier
# and publisher spelling variants that the first rows do not.
apc_data.tail()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
2122,2901593,Wolters Kluwer Health,Circulation Research,Mechanistic Links Between Na+ Channel (SCN5A) ...,£1334.15
2123,3748854,Wolters Kluwer Health,AIDS,Evaluation of an empiric risk screening score ...,£1834.77
2124,3785148,Wolters Kluwer Health,Pediatr Infect Dis J,Topical umbilical cord care for prevention of ...,£1834.77
2125,PMCID:\n PMC3647051\n,Wolters Kluwer N.V./Lippinott,AIDS,Grassroots Community Organisations' Contributi...,£2374.52
2126,PMID: 23846567 (Epub July 2013),Wolters Kluwers,Journal of Acquired Immune Deficiency Syndromes,A novel community health worker tool outperfor...,£2034.75


In [201]:
# List the original column headers exactly as they appear in the CSV,
# so they can be mapped to shorter names.
apc_data.columns

Index(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title',
       'COST (£) charged to Wellcome (inc VAT when charged)'],
      dtype='object')

In [202]:
# Swap the verbose spreadsheet headers for short snake_case names that are
# easier to type and can be used with attribute access (e.g. apc_data.publisher).
apc_data = apc_data.rename(
    columns={
        'PMID/PMCID': 'pmid_pmcid',
        'Publisher': 'publisher',
        'Journal title': 'journal_title',
        'Article title': 'article_title',
        'COST (£) charged to Wellcome (inc VAT when charged)': 'cost',
    }
)

In [203]:
# Confirm that the rename took effect: the headers should now be the short names.
apc_data.head()

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [204]:
# Summarise dtypes and non-null counts per column. This reveals which columns
# contain missing values and that every column, including cost, is read as text.
apc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
pmid_pmcid       1928 non-null object
publisher        2127 non-null object
journal_title    2126 non-null object
article_title    2127 non-null object
cost             2127 non-null object
dtypes: object(5)
memory usage: 83.2+ KB


In [205]:
# Display the identifier column to see how inconsistently it is filled in:
# missing values, bare PMC ids, bare numeric ids, PMID and PMCID together,
# 'PMCID:' / 'PMID:' prefixes, stray whitespace and embedded newlines.
apc_data['pmid_pmcid']

0                                    NaN
1                             PMC3679557
2                  23043264  PMC3506128 
3                    23438330 PMC3646402
4                   23438216 PMC3601604 
5                             PMC3579457
6                             PMC3709265
7                   23057412 PMC3495574 
8                      PMCID: PMC3780468
9                      PMCID: PMC3621575
10                     PMCID: PMC3739413
11                     PMCID: PMC3530961
12                     PMCID: PMC3624797
13                            PMC3413243
14                            PMC3694353
15                            PMC3572711
16                              22610094
17                     PMCID: PMC3586974
18          23455506  PMCID: PMC3607399 
19            PMID: 24015914 PMC3833349 
20                         : PMC3805332 
21                                   NaN
22              PMCID:\n    PMC3656742\n
23                        PMCID: 3584654
24              

In [206]:
# List the distinct raw identifier strings (missing entries appear as nan).
apc_data['pmid_pmcid'].unique()

array([nan, 'PMC3679557', '23043264  PMC3506128 ', ..., '3748854',
       '3785148', 'PMID: 23846567  (Epub July 2013)'], dtype=object)

In [207]:
# Look at the first rows again before starting to clean the publisher column.
apc_data.head()

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [208]:
# Upper-case every publisher name so that spellings differing only in letter
# case collapse into a single value. The column is overwritten in place;
# re-running the cell is harmless because upper-casing twice changes nothing.
apc_data['publisher'] = apc_data['publisher'].str.upper()

In [209]:
# Count how many rows carry each publisher name. The output exposes names that
# presumably refer to the same publisher but are still counted separately
# (e.g. 'OXFORD UNIVERSITY PRESS' / 'OUP', 'PUBLIC LIBRARY OF SCIENCE' / 'PLOS').
apc_data['publisher'].value_counts()

ELSEVIER                                                            391
PUBLIC LIBRARY OF SCIENCE                                           278
WILEY                                                               136
SPRINGER                                                             81
OXFORD UNIVERSITY PRESS                                              79
OUP                                                                  56
WILEY-BLACKWELL                                                      56
BIOMED CENTRAL                                                       51
NATURE PUBLISHING GROUP                                              47
ASBMB                                                                46
BMC                                                                  26
PLOS                                                                 24
NATURE                                                               24
FRONTIERS                                                       

In [249]:
# Number of distinct publisher strings remaining after upper-casing,
# returned as a one-element shape tuple.
apc_data['publisher'].unique().shape

(275,)

In [250]:
#strip the data of all unnecessary spaces and newline characters
#apc_data.loc[:, 'publisher'].str.strip().unique().shape


In [252]:
# Collapse spelling variants of two publishers into one canonical name each.
# Every pattern is anchored with ^...$ and padded with .* so the whole cell
# value is replaced, not only the matching fragment.
#
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave every value unchanged.
# Raw strings keep '\s' from being read as an invalid Python string escape.
#
# NOTE(review): the result is only displayed; it is not assigned back to
# apc_data['publisher'], so the frame itself is unchanged by this cell. The
# recorded output `(267,)` suggests an earlier run ended with `.unique().shape`.
# NOTE(review): 'ACS' matches anywhere in a name, and any 'AMERICAN SOC ' name
# is mapped to the biochemistry society. Confirm that neither pattern catches
# unrelated publishers before persisting the result.
(
    apc_data['publisher']
    .str.replace(r'(^.*ACS.*$)', 'AMERICAN CHEMICAL SOCIETY', regex=True)
    .str.replace(
        r'(^.*AMERICAN.SOCIETY.FOR.BIO.*$)',
        'AMERICAN SOCIETY FOR BIOCHEMISTRY AND MOLECULAR BIOLOGY',
        regex=True,
    )
    .str.replace(
        r'(^.*AMERICAN.SOC\s.*$)',
        'AMERICAN SOCIETY FOR BIOCHEMISTRY AND MOLECULAR BIOLOGY',
        regex=True,
    )
)


(267,)

In [192]:
# Pull out the row(s) whose publisher is this unusually long string.
# NOTE(review): in the recorded output the journal_title of this row is
# 'National Academy of Sciences', so the publisher and journal fields look
# mis-entered. Verify against the source spreadsheet.
apc_data.loc[
    apc_data['publisher'].eq(
        'JOURNAL OF THE AMERICAN PHYSIOLOGICAL PROCEEDINGS OF NATIONAL ACADEMY OF SCIENCES'
    )
]

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
960,PMID23213218 PMC3529057,JOURNAL OF THE AMERICAN PHYSIOLOGICAL PROCEEDI...,National Academy of Sciences,Morphing between expressions dissociates conti...,£1052.99
