### Cleaning Dirty Data

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [45]:
# The file contains characters that fail to decode with the default encoding,
# so it is read with the 'unicode_escape' codec instead.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the data; if the file is really Latin-1/cp1252, that codec may be a safer
# choice -- confirm against the raw file.
apc_data = pd.read_csv(
    'apcspend.csv',
    encoding='unicode_escape',
)

In [46]:
# Preview the first five rows of the freshly loaded frame.
apc_data.head(n=5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [47]:
# Preview the last five rows to check the end of the file loaded cleanly.
apc_data.tail(n=5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
2122,2901593,Wolters Kluwer Health,Circulation Research,Mechanistic Links Between Na+ Channel (SCN5A) ...,£1334.15
2123,3748854,Wolters Kluwer Health,AIDS,Evaluation of an empiric risk screening score ...,£1834.77
2124,3785148,Wolters Kluwer Health,Pediatr Infect Dis J,Topical umbilical cord care for prevention of ...,£1834.77
2125,PMCID:\n PMC3647051\n,Wolters Kluwer N.V./Lippinott,AIDS,Grassroots Community Organisations' Contributi...,£2374.52
2126,PMID: 23846567 (Epub July 2013),Wolters Kluwers,Journal of Acquired Immune Deficiency Syndromes,A novel community health worker tool outperfor...,£2034.75


In [48]:
# List the column labels exactly as they were read from the CSV.
apc_data.columns

Index(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title',
       'COST (£) charged to Wellcome (inc VAT when charged)'],
      dtype='object')

In [49]:
# Swap the verbose spreadsheet headers for short snake_case names that are
# easier to reference in code.
column_names = {
    'PMID/PMCID': 'pmid_pmcid',
    'Publisher': 'publisher',
    'Journal title': 'journal_title',
    'Article title': 'article_title',
    'COST (£) charged to Wellcome (inc VAT when charged)': 'cost',
}
apc_data = apc_data.rename(columns=column_names)

In [50]:
# Confirm the renamed columns by previewing the first five rows.
apc_data.head(n=5)

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [51]:
# Summarise each column's dtype and non-null count to spot missing values.
apc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
pmid_pmcid       1928 non-null object
publisher        2127 non-null object
journal_title    2126 non-null object
article_title    2127 non-null object
cost             2127 non-null object
dtypes: object(5)
memory usage: 83.2+ KB


In [52]:
# Inspect the raw identifier column. The values are inconsistently formatted:
# optional 'PMID:' / 'PMCID:' prefixes, embedded newlines, stray whitespace
# and missing entries (NaN).
apc_data['pmid_pmcid']

0                                    NaN
1                             PMC3679557
2                  23043264  PMC3506128 
3                    23438330 PMC3646402
4                   23438216 PMC3601604 
5                             PMC3579457
6                             PMC3709265
7                   23057412 PMC3495574 
8                      PMCID: PMC3780468
9                      PMCID: PMC3621575
10                     PMCID: PMC3739413
11                     PMCID: PMC3530961
12                     PMCID: PMC3624797
13                            PMC3413243
14                            PMC3694353
15                            PMC3572711
16                              22610094
17                     PMCID: PMC3586974
18          23455506  PMCID: PMC3607399 
19            PMID: 24015914 PMC3833349 
20                         : PMC3805332 
21                                   NaN
22              PMCID:\n    PMC3656742\n
23                        PMCID: 3584654
24              

In [53]:
# Distinct values of the identifier column (NaN included).
apc_data['pmid_pmcid'].unique()

array([nan, 'PMC3679557', '23043264  PMC3506128 ', ..., '3748854',
       '3785148', 'PMID: 23846567  (Epub July 2013)'], dtype=object)

In [54]:
# Re-display the first five rows before starting on the publisher column.
apc_data.head(n=5)

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [56]:
# Upper-case every publisher name so that spellings differing only in
# capitalisation collapse to the same value.
publisher_upper = apc_data['publisher'].str.upper()
apc_data['publisher'] = publisher_upper

In [62]:
# Frequency of each publisher spelling, most common first.
apc_data.publisher.value_counts()

ELSEVIER                                                            391
PUBLIC LIBRARY OF SCIENCE                                           278
WILEY                                                               136
SPRINGER                                                             81
OXFORD UNIVERSITY PRESS                                              79
OUP                                                                  56
WILEY-BLACKWELL                                                      56
BIOMED CENTRAL                                                       51
NATURE PUBLISHING GROUP                                              47
ASBMB                                                                46
BMC                                                                  26
PLOS                                                                 24
NATURE                                                               24
FRONTIERS                                                       

In [64]:
# Shape of the array of distinct publisher spellings, i.e. how many there are.
distinct_publishers = apc_data.publisher.unique()
distinct_publishers.shape

(275,)

In [92]:
# Preview the effect of collapsing publisher-name variants onto one canonical
# spelling per publisher. Only the distinct results are displayed; `apc_data`
# itself is not modified by this cell.
#
# Every pattern is a raw string with an explicit `regex=` flag because:
#   * pandas >= 2.0 treats `str.replace` patterns as literal text by default,
#     which would silently turn the regex rules below into no-ops;
#   * '\w' / '\s' inside a non-raw string are invalid escape sequences and
#     trigger warnings on current Python versions.
# Rules run in order, so each one sees the output of the rules before it.
#
# NOTE(review): the unescaped '.' in the regex patterns matches ANY character
# (e.g. 'WILEY.' also consumes whatever follows 'WILEY'); patterns are kept
# as originally written to preserve the current matches -- confirm intent.
(
    apc_data['publisher']
    # --- Wolters Kluwer ---
    .str.replace(r'WOLTERS.\w*.\w*', 'WOLTERS KLUWER', regex=True)
    .str.rstrip()
    .str.replace(r'.V./LIPPINOTT', "", regex=True)
    # --- Wiley ---
    .str.replace(r'WILEY[-\s]\w*.\w*.\w*.\w*', 'WILEY', regex=True)
    .str.strip()
    .str.replace('WLIEY-BLACKWELL', 'WILEY', regex=False)
    .str.replace('WILEY/BLACKWELL', 'WILEY', regex=False)
    .str.replace('WILEY & SONS', 'WILEY', regex=False)
    .str.replace(r'WILEY.', 'WILEY', regex=True)
    # --- The Endocrine Society ---
    .str.replace(r'THE\sEND\w*.\w*', 'THE ENDOCRINE SOCIETY', regex=True)
    # --- The Company of Biologists ---
    # NOTE(review): '.w*' looks like a typo for '\w*'; left unchanged so the
    # set of matched strings stays the same -- confirm and correct.
    .str.replace(r'THE\sCOMPANY\s.w*.\w*.\w*.\w*', 'THE COMPANY OF BIOLOGISTS', regex=True)
    # --- Drop the ', INC' suffix ---
    .str.replace(', INC', '', regex=False)
    # --- Taylor & Francis ---
    .str.replace(r'TAYLOR.\w*.\w*.\w*.\w*', 'TAYLOR & FRANCIS', regex=True)
    .str.replace('T&F', 'TAYLOR & FRANCIS', regex=False)
    # --- Springer ---
    .str.replace(r'SPRINGER[-\s]\w*.\w*.\w*.\w*.\w*.\w*.\w*.\w*.\w*.', 'SPRINGER', regex=True)
    .str.replace('SPRINGERBERLIN', 'SPRINGER', regex=False)
    .str.replace('SPRINGER VERLAG', 'SPRINGER', regex=False)
    # --- Learned societies ---
    .str.replace(r'SOCIETY\s(FOR|OF)\sNE.\w*.\w*.', 'SOCIETY FOR NEUROSCIENCES', regex=True)
    .str.replace(r'SOCIETY\s(FOR|OF)\sGEN.\w*.\w*.\w*', 'SOCIETY FOR GENERAL MICROBIOLOGY', regex=True)
    # --- SAGE ---
    .str.replace(r'SAGE\sPUB\w*.\w*.\w*.\w*.\w*.\w*.\w*.\w*.', 'SAGE', regex=True)
    .str.replace('SAGE PUBLISHERS', 'SAGE', regex=False)
    .str.replace('SAGE PUBLISHING', 'SAGE', regex=False)
    .unique()
)




array(['CUP', 'ACS', 'ACS (AMERCIAN CHEMICAL SOCIETY) PUBLICATIONS',
       'ACS PUBLICATIONS', 'AGA INSTITUTE', 'AMBSB',
       'AMERICAN ASSOCIATION OF IMMUNOLOGISTS',
       'AMERICAN CHEMICAL SOCIETY',
       'AMERICAN CHEMICAL SOCIETY PUBLICATIONS',
       'AMERICAN COLLEGE OF CHEST PHYSICIANS',
       'AMERICAN PHYSIOLOGICAL SOCIETY',
       'AMERICAN PSYCHIATRIC ASSOCIATION',
       'AMERICAN PSYCHIATRIC PUBLISHING',
       'AMERICAN PSYCHOLOGICAL ASSOCIATION',
       'AMERICAN PUBLIC HEALTH ASSOCIATION',
       'AMERICAN SOC FOR BIOCHEMISTRY AND MOLECULAR BIOLOGY',
       'AMERICAN SOCIETY FOR BIOCHEMISTRY AND MOLECULAR BIOLGY',
       'AMERICAN SOCIETY FOR BIOCHEMISTRY AND MOLECULAR BIOLOGY',
       'AMERICAN SOCIETY FOR INVESTIGATIVE PATHOLOGY',
       'AMERICAN SOCIETY FOR MICROBIOLOGY',
       'AMERICAN SOCIETY FOR NUTRITION',
       'AMERICAN SOCIETY OF HAEMATOLOGY',
       'AMERICAN SOCIETY OF HAMATOLOGY', 'AMERICAN SOCIETY OF HEMATOLOGY',
       'AMERICAN SOCIETY OF HUMA

In [89]:
# Show every row whose publisher is exactly 'SCIEDU PRESS'.
is_sciedu = apc_data['publisher'].eq('SCIEDU PRESS')
apc_data.loc[is_sciedu]

Unnamed: 0,pmid_pmcid,publisher,journal_title,article_title,cost
1676,,SCIEDU PRESS,International Journal of Financial Research\n,Determinants of Enrolment in Voluntary Health ...,£187.03
1677,,SCIEDU PRESS,Journal of Biomedical Graphics and Computing,Functional MRI demonstrates pain perception in...,£135.29
