In [74]:
import pandas as pd
import numpy as np
from IPython.display import display

q1 = pd.read_csv("data/2000s.csv", sep=",")

q1.rename(columns={'conferenceTitle':'conferenceData', 'journalTitle':'journalData'}, inplace=True)

## Extract year from issued date:

In [75]:
issuedYear = q1.issued.str.extract("(\d{4}|[0,3-9]\d)", expand=False)
issuedYear = pd.to_numeric(issuedYear, errors='coerce')

In [76]:
def completeYears(year):
    century = 1900 if year > 40 else 2000
    return century + year
issuedYear[issuedYear < 99] = issuedYear[issuedYear < 99].apply(completeYears)

In [77]:
q1['issuedYear'] = issuedYear

In [78]:
issuedMonthText = q1.issued.str.extract("(ene|jan|feb|mar|apr|abr|mai|may|jun|jul|aug|ago|sept|sep|oct|nov|dic|dec)", expand=False)
issuedMonthNumeric = q1.issued.str.extract("\D(\d|1[1-2])\D?$", expand=False)
issuedMonth = issuedMonthText.fillna(issuedMonthNumeric)

In [79]:
issuedMonth = issuedMonth.replace('ene|jan', '1', regex=True)
issuedMonth = issuedMonth.replace('feb', '2')
issuedMonth = issuedMonth.replace('mar', '3')
issuedMonth = issuedMonth.replace('apr|abr', '4', regex=True)
issuedMonth = issuedMonth.replace('mai|may', '5', regex=True)
issuedMonth = issuedMonth.replace('jun', '6')
issuedMonth = issuedMonth.replace('jul', '7')
issuedMonth = issuedMonth.replace('ago|aug', '8', regex=True)
issuedMonth = issuedMonth.replace('sept|sep', '9', regex=True)
issuedMonth = issuedMonth.replace('oct', '10')
issuedMonth = issuedMonth.replace('nov', '11')
issuedMonth = issuedMonth.replace('dic|dec', '12', regex=True)
issuedMonth = issuedMonth.fillna('6')

In [80]:
q1['issuedMonth'] = issuedMonth

In [20]:
issuedMonth.unique()

array(['6', '1', '7', '4', '5', '3', '8', '9', '10', '2', '12', '11'], dtype=object)

## Removing misstyped journal names

In [50]:
q1.replace('journalData', q1.groupby('journalIssn').journalData.agg(lambda x:x.value_counts().index[0]), inplace='true')

In [2]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [7]:
similar('International scientific-practical conference dedicated to the 30-anniversary of the XXI century','International scientific-practical conference dedicated to the 30-anniversary of the BelRIPP',)

0.9148936170212766

In [52]:
q1.groupby('journalIssn').journalData.value_count()

AttributeError: 'SeriesGroupBy' object has no attribute 'value_count'

## Merging with authors

In [81]:
authors = pd.read_csv("data/2000s_authors.csv", sep=",")

In [82]:
authors.authorNames = authors.authorNames.str.split('|')

In [83]:
q1 = pd.merge(q1, authors, on='articleId', how='left')

## Merging with subjects

In [84]:
subjects = pd.read_csv("data/2000s_subjects.csv", sep=",")

In [85]:
import re
        
subjects.subjects = subjects.subjects.apply(lambda s: re.findall("(c_\d+)", s))

In [86]:
q1 = pd.merge(q1, subjects, on='articleId', how='left')

In [87]:
q1

Unnamed: 0,articleId,issued,language,submitted,conferenceData,journalData,journalIssn,issuedYear,issuedMonth,authorNames,subjects
0,LV2006000315,2005,eng,2006,,,,2005,6,,"[c_4788, c_7377, c_6732, c_24056, c_36930, c_2..."
1,LV2006000415,2005,eng,2006,,,,2005,6,,"[c_2350, c_2488, c_8673, c_28787, c_24008, c_1..."
2,BY2001000273,2001,rus,2001,International scientific-practical conference ...,Vestsi Akadehmii agrarnykh navuk Rehspubliki B...,0321-1657,2001,6,,"[c_542, c_33112, c_5978, c_2491, c_3146, c_853..."
3,BY2001000273,2001,rus,2002,International scientific-practical conference ...,Vestsi Akadehmii agrarnykh navuk Rehspubliki B...,0321-1657,2001,6,,"[c_542, c_33112, c_5978, c_2491, c_3146, c_853..."
4,CL2001000192,ene-mar2001,esp,2001,51 Congreso de la Sociedad Agronomica de Chile...,Agricultura Tecnica (Chile),0365-2807,2001,1,,"[c_1548, c_6204, c_8283, c_26808, c_3952]"
5,CL2001000498,jul-sep2001,esp,2001,24. Reunion Anual. Sociedad Chilena de Producc...,Agricultura Tecnica (Chile),0365-2807,2001,7,,"[c_8504, c_26767, c_15998, c_429, c_7917, c_84..."
6,CL2001000507,abr-jun2001,esp,2001,"50. Congreso Agronomico, Pucon (Chile), 8-12 N...",Agricultura Tecnica (Chile),0365-2807,2001,4,,"[c_6251, c_6400, c_49985, c_5956, c_7951, c_8157]"
7,CL2001000515,abr-jun2001,esp,2001,"51. Congreso Agronomico, Talca (Chile), 7-11 N...",Agricultura Tecnica (Chile),0365-2807,2001,4,,"[c_4829, c_1548, c_6202, c_34762, c_26808]"
8,BY2001000166,2001,rus,2001,International scientific-practical conference ...,,,2001,6,,"[c_542, c_16196, c_8347, c_5978, c_3566, c_33237]"
9,BY2001000167,2001,rus,2001,International scientific-practical conference ...,,,2001,6,,"[c_542, c_16196, c_33112, c_5978, c_5739, c_64..."
