In [4]:
import pandas as pd
import numpy as np
from IPython.display import display

# Load data/2000s.csv and give the two venue title columns their *Data names.
venueColumnNames = {'conferenceTitle': 'conferenceData', 'journalTitle': 'journalData'}
q1 = pd.read_csv("data/2000s.csv").rename(columns=venueColumnNames)

## Extract year from issued date:

In [2]:
# Take the first four-digit run, or else a two-digit year starting with 0 or 3-9.
# The character class must not contain a comma: "[0,3-9]" also matches a literal
# ',' and would capture text such as ",0", which then coerces to NaN.
issuedYear = q1.issued.str.extract(r"(\d{4}|[03-9]\d)", expand=False)
# Anything that is not a clean number becomes NaN instead of raising.
issuedYear = pd.to_numeric(issuedYear, errors='coerce')

In [3]:
def completeYears(year, pivot=40):
    """Expand a two-digit year into a four-digit year.

    Years strictly greater than ``pivot`` are placed in the 1900s; all others
    (including ``pivot`` itself) are placed in the 2000s.

    Parameters
    ----------
    year : number
        Two-digit year.
    pivot : number, default 40
        Largest two-digit year that is still read as 20xx.

    Returns
    -------
    number
        ``year`` plus 1900 or 2000.
    """
    century = 1900 if year > pivot else 2000
    return century + year
# Every value below 100 is a two-digit year; the bound has to be 100 (not 99)
# so that 99 is expanded to 1999 as well. NaN compares False and is left alone.
twoDigitMask = issuedYear < 100
issuedYear.loc[twoDigitMask] = issuedYear.loc[twoDigitMask].apply(completeYears)

In [4]:
# Attach the parsed year to the main frame as the issuedYear column.
q1 = q1.assign(issuedYear=issuedYear)

In [5]:
# Month written as a lower-case text abbreviation anywhere in the issued string.
issuedMonthText = q1.issued.str.extract("(ene|jan|feb|mar|apr|abr|mai|may|jun|jul|aug|ago|sept|sep|oct|nov|dic|dec)", expand=False)
# Month written as a trailing number that follows a non-digit, optionally followed
# by one more non-digit. 1[0-2] covers October to December; a class of [1-2] would
# miss "10" and silently send October to the default month.
issuedMonthNumeric = q1.issued.str.extract(r"\D(1[0-2]|\d)\D?$", expand=False)
# Prefer the text month; fall back to the numeric one where no text month was found.
issuedMonth = issuedMonthText.fillna(issuedMonthNumeric)

In [6]:
# Turn month abbreviations into month numbers (kept as strings), one substitution
# per month and in calendar order.
monthSubstitutions = (
    # (pattern, month number, treat pattern as a regex?)
    ('ene|jan', '1', True),
    ('feb', '2', False),
    ('mar', '3', False),
    ('apr|abr', '4', True),
    ('mai|may', '5', True),
    ('jun', '6', False),
    ('jul', '7', False),
    ('ago|aug', '8', True),
    ('sept|sep', '9', True),
    ('oct', '10', False),
    ('nov', '11', False),
    ('dic|dec', '12', True),
)
for monthPattern, monthNumber, isRegex in monthSubstitutions:
    issuedMonth = issuedMonth.replace(monthPattern, monthNumber, regex=isRegex)

# Rows where no month could be extracted default to '6'.
issuedMonth = issuedMonth.fillna('6')

In [7]:
# Attach the normalised month to the main frame as the issuedMonth column.
q1 = q1.assign(issuedMonth=issuedMonth)

In [20]:
# Sanity check: show the distinct month values that remain after normalisation.
issuedMonth.unique()

array(['6', '1', '7', '4', '5', '3', '8', '9', '10', '2', '12', '11'], dtype=object)

## Removing mistyped journal names

In [2]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``.

    The score is ``SequenceMatcher.ratio()``: a float in [0, 1], where 1.0
    means the two sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [34]:
# For every ISSN take its most frequent title as the canonical one. An ISSN whose
# titles are all missing yields NaN instead of raising IndexError on an empty
# value_counts() result.
canonicalTitleByIssn = q1.groupby('journalIssn').journalData.agg(
    lambda titles: titles.value_counts().index[0] if titles.notna().any() else np.nan
)
# Rows with a missing ISSN (or an ISSN without any title) keep their current
# title rather than being overwritten with NaN.
q1['journalData'] = q1.journalIssn.map(canonicalTitleByIssn).fillna(q1.journalData)

In [36]:
# For every journal title take its most frequent ISSN as the canonical one. A
# title whose ISSNs are all missing yields NaN instead of raising IndexError on
# an empty value_counts() result.
canonicalIssnByTitle = q1.groupby('journalData').journalIssn.agg(
    lambda issns: issns.value_counts().index[0] if issns.notna().any() else np.nan
)
# Rows with a missing title (or a title without any ISSN) keep their current
# ISSN rather than being overwritten with NaN.
q1['journalIssn'] = q1.journalData.map(canonicalIssnByTitle).fillna(q1.journalIssn)

In [35]:
# Inspect how many rows carry each title within every ISSN.
q1.groupby('journalIssn')['journalData'].value_counts()

journalIssn  journalData                                                                                                                    
0002-1407    Journal of the Agricultural Chemical Society of Japan (Japan)                                                                       42
0003-6862    Applied Entomology and Zoology (Japan)                                                                                              48
0011-1848    Japanese Journal of Crop Science (Japan)                                                                                            75
0013-7626    Journal of the Japanese Society for Horticultural Science (Japan)                                                                   73
0015-6426    Journal of the Food Hygienic Society of Japan (Japan)                                                                               44
0016-531X    Gayana (Chile)                                                                                            

In [37]:
# Inspect how many rows carry each ISSN within every journal title.
q1.groupby('journalData')['journalIssn'].value_counts()

journalData                                                                                                                                                                                  journalIssn
"Bulletin of Tokushima Prefectural Agriculture, Forestry and Fisheries Technology Center Fisheries Research Institute (Japan)."                                                              1347-2763        5
"Bulletin of the Agricultural Experiment Station, Okayama Prefectural General Agriculture Center (Japan)."                                                                                   1346-6658       12
"Bulletin of the Horticultural Institute, Ibaraki Agricultural Center (Japan)."                                                                                                              0919-4975        4
"Memoirs of the Graduate School of Fisheries Sciences, Hokkaido University (Japan)."                                                                                           

## Merging with authors

In [2]:
# Read the authors table (comma-separated, pandas' default delimiter).
authors = pd.read_csv("data/2000s_authors.csv")

In [3]:
# Break the '|'-delimited authorNames string into a list of names per row.
authors = authors.assign(
    authorNames=lambda frame: frame['authorNames'].str.split('|')
)

In [10]:
# Left join: every row of q1 is kept, with author columns matched on articleId.
q1 = q1.merge(authors, how='left', on='articleId')

## Merging with subjects

In [11]:
# Read the subjects table (comma-separated, pandas' default delimiter).
subjects = pd.read_csv("data/2000s_subjects.csv")

In [12]:
import re
        
# Collect every token of the form c_<digits> from the raw subjects string into a
# list per row. The vectorised .str accessor leaves missing values as NaN, whereas
# calling re.findall on a NaN would raise TypeError; the raw string keeps \d from
# being read as an (invalid) string escape.
subjects['subjects'] = subjects['subjects'].str.findall(r"(c_\d+)")

In [13]:
# Left join: every row of q1 is kept, with subject columns matched on articleId.
q1 = q1.merge(subjects, how='left', on='articleId')

In [4]:
# Persist the authors frame to CSV.
# NOTE(review): with to_csv defaults the row index is written as an extra leading
# column, and if authorNames has already been split into lists it is serialised as
# Python list text rather than a '|'-delimited string — confirm that downstream
# readers of this file expect both.
authors.to_csv('data/2000s_authors_clean.csv')