In [1]:
import pyspark
import os
import math
import random
import sys
import pandas as pd
import numpy as np
from IPython.display import display
import re

# Make sure PySpark tells its workers to use python2, not python3, when both
# are installed.
os.environ["PYSPARK_PYTHON"] = "python2"
# Hand the driver's import path to the workers. os.pathsep is the separator
# PYTHONPATH uses on the current platform (':' on POSIX, ';' on Windows).
os.environ['PYTHONPATH'] = os.pathsep.join(sys.path)

# NOTE: pyspark is already imported at the top of this cell; this second
# import is redundant but harmless.
import pyspark
from pyspark.sql import SQLContext

# Create a Spark context with the 'local[*]' master and wrap it in a
# SQLContext.
sc = pyspark.SparkContext('local[*]')
sqlCtx = SQLContext(sc)

In [2]:
from difflib import SequenceMatcher
# https://docs.python.org/2.4/lib/sequence-matcher.html

def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    The ratio is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk filter is applied (isjunk is None).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [3]:
# The article data is split across six CSV chunks; every chunk is read with
# the same parser settings.
article_chunk_paths = [
    "data/0M-1M.csv",
    "data/1M-2M.csv",
    "data/2M-3M.csv",
    "data/3M-4M.csv",
    "data/4M-5M.csv",
    "data/5M-6M.csv",
]
df1, df2, df3, df4, df5, df6 = [
    pd.read_csv(chunk_path, sep=",", header=0, low_memory=False)
    for chunk_path in article_chunk_paths
]

# Stack the chunks into a single frame with a fresh 0..N-1 index.
df_ = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Work on a random sample of 10000 rows. The draw is not seeded, so every run
# picks different rows; the sample is written to disk before being sorted by
# its original index.
articles = df_.sample(n=10000)
articles.to_csv('data/current_sample.csv')
articles.sort_index(inplace=True)

In [4]:
# Author data: six CSV chunks read with identical parser settings. The df1..df6
# names are reused from the article-loading cell.
author_chunk_paths = [
    "data/Authors-0M-1M.csv",
    "data/Authors-1M-2M.csv",
    "data/Authors-2M-3M.csv",
    "data/Authors-3M-4M.csv",
    "data/Authors-4M-5M.csv",
    "data/Authors-5M-6M.csv",
]
df1, df2, df3, df4, df5, df6 = [
    pd.read_csv(chunk_path, sep=",", header=0, low_memory=False)
    for chunk_path in author_chunk_paths
]

df_Authors_ = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Subject data: only five chunks are read here (there is no 5M-6M file in
# this list), so df6 still holds the last author chunk afterwards.
subject_chunk_paths = [
    "data/Sub-0M-1M.csv",
    "data/Sub-1M-2M.csv",
    "data/Sub-2M-3M.csv",
    "data/Sub-3M-4M.csv",
    "data/Sub-4M-5M.csv",
]
df1, df2, df3, df4, df5 = [
    pd.read_csv(chunk_path, sep=",", header=0, low_memory=False)
    for chunk_path in subject_chunk_paths
]

df_Subjects = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [5]:
# Rename the two title columns in place; the rest of the notebook refers to
# them as conferenceData and journalData.
articles.rename(columns={'conferenceTitle':'conferenceData', 'journalTitle':'journalData'}, inplace=True)

## Extract conference information:

We want to group the conference records, assigning the same ID to all records that refer to the same conference.

We are also interested in each conference's year, so that it can be compared with the other years in the data.

In [6]:
# Number of sampled articles that have conference information
len(articles[articles.conferenceData.notnull()])

265

In [7]:
# Working frame built from the conferenceData column only; the extracted year
# and the conference ID are added to it as extra columns in later cells.
conferenceDF =  pd.DataFrame(articles.conferenceData)

### Get the year from the data
First, we check whether every conference entry contains a year.

In [8]:
# Treat any run of four consecutive digits as a year.
# First, count the entries that contain more than one group of four consecutive digits.
pattern = r'^.*\d{4}.*\d{4}.*$'
np.sum(conferenceDF.conferenceData[conferenceDF.conferenceData.notnull()].str.contains(pattern,na=False))

10

In [9]:
# If there are any, check whether both groups are the same year. Keep the
# entries where the two captured values differ; those will be discarded.
pattern = r'^.*(\d{4}).*(\d{4}).*$'
moreThanOne = conferenceDF.conferenceData[conferenceDF.conferenceData.notnull()].str.extract(pattern,expand=False)
moreThanOneDifferents = moreThanOne[moreThanOne[0].notnull() & (moreThanOne[0] != moreThanOne[1])]

In [10]:
# Index labels of the entries whose two four-digit groups disagree
moreThanOneDifferents.index

Int64Index([2838736], dtype='int64')

In [11]:
# Even if the previous result is 0, we still have to check that every entry
# contains four consecutive digits. The value shown is the number of non-null
# entries that have no four-digit group at all.
pattern = r'^.*\d{4}.*$'
len(conferenceDF.conferenceData[conferenceDF.conferenceData.notnull()]) - np.sum(conferenceDF.conferenceData[conferenceDF.conferenceData.notnull()].str.contains(pattern,na=False))

2

In [12]:
# Find the entries whose year is not in the expected format (no four-digit
# group). `pattern` is the one assigned in the preceding cell.
badYearFormat = conferenceDF.conferenceData[conferenceDF.conferenceData.notnull()].str.contains(pattern,na=False)
# Keep only the entries that did NOT match the pattern
badYearFormat = badYearFormat[badYearFormat == False]
conferenceDF.conferenceData[conferenceDF.conferenceData.index.isin(badYearFormat.index)]

3241085                                    RIZA.
5072815    Western Cotton Production Conference.
Name: conferenceData, dtype: object

In [13]:
# Extract the year and blank out the entries listed in moreThanOneDifferents.
# The leading '.*' is greedy, so the captured group is the LAST run of four
# digits in each entry.
pattern = r'^.*(\d{4}).*$'
conferenceDF['conferenceDateYear'] = conferenceDF.conferenceData.str.extract(pattern, expand=False)
# Assign through a single .loc call. The chained form
# `frame.column[mask] = value` may write to a temporary copy and leave the
# frame unchanged.
ambiguous_year = conferenceDF.index.isin(moreThanOneDifferents.index)
conferenceDF.loc[ambiguous_year, 'conferenceDateYear'] = np.nan
# The extracted values are strings; convert them to numbers (NaN where missing).
conferenceDF['conferenceDateYear'] = pd.to_numeric(conferenceDF.conferenceDateYear, errors='coerce')
# Number of entries that ended up with a usable year
len(conferenceDF.conferenceDateYear[conferenceDF.conferenceDateYear.notnull()])

262

In [14]:
# Copy the extracted conference year back onto the articles frame (aligned on
# the shared index).
articles["conferenceYear"] = conferenceDF.conferenceDateYear

## Extract year from issued date:

In [15]:
# Take the first four-digit group, or failing that a two-digit group whose
# first digit is 0 or 3-9. The character class used to be '[0,3-9]', which
# also accepted a literal comma: text such as ',5' was captured and then
# coerced to NaN instead of letting the search continue to a real year.
# NOTE(review): a leading two-digit day such as '05' would still be taken as
# a year; confirm against the actual formats in `issued`.
issuedYear = articles.issued.str.extract(r"(\d{4}|[03-9]\d)", expand=False)
issuedYear = pd.to_numeric(issuedYear, errors='coerce')

In [16]:
def completeYears(year):
    """Expand a two-digit year to four digits.

    Values above 40 are placed in the 1900s; values of 40 or below are placed
    in the 2000s.
    """
    if year > 40:
        return 1900 + year
    return 2000 + year
# Expand every two-digit year (0-99). The bound used to be `< 99`, which left
# the two-digit year 99 unexpanded.
two_digit_year = issuedYear < 100
issuedYear[two_digit_year] = issuedYear[two_digit_year].apply(completeYears)

In [17]:
# Store the cleaned four-digit issue year on the articles frame.
articles['issuedYear'] = issuedYear

In [18]:
# Month given as a (Spanish/English/Portuguese-style) three- or four-letter
# abbreviation. The match is case-sensitive.
# NOTE(review): capitalised abbreviations such as 'Jan' would not match;
# confirm the casing used in `issued`.
issuedMonthText = articles.issued.str.extract("(ene|jan|feb|mar|apr|abr|mai|may|jun|jul|aug|ago|sept|sep|oct|nov|dic|dec)", expand=False)
# Month given as a trailing number preceded by a non-digit. The two-digit
# alternative used to be '1[1-2]', which could never match month 10; it is now
# '1[0-2]' and is tried before the single-digit alternative.
issuedMonthNumeric = articles.issued.str.extract(r"\D(1[0-2]|\d)\D?$", expand=False)
# Prefer the textual month; fall back to the numeric one.
issuedMonth = issuedMonthText.fillna(issuedMonthNumeric)

In [19]:
# Month abbreviation -> month number (as a string). Each entry is
# (token or pattern, replacement, treat-as-regex) and the replacements are
# applied in this order.
month_replacements = [
    ('ene|jan', '1', True),
    ('feb', '2', False),
    ('mar', '3', False),
    ('apr|abr', '4', True),
    ('mai|may', '5', True),
    ('jun', '6', False),
    ('jul', '7', False),
    ('ago|aug', '8', True),
    ('sept|sep', '9', True),
    ('oct', '10', False),
    ('nov', '11', False),
    ('dic|dec', '12', True),
]
for month_token, month_number, is_regex in month_replacements:
    issuedMonth = issuedMonth.replace(month_token, month_number, regex=is_regex)

# Rows where no month could be extracted default to '6'.
issuedMonth = issuedMonth.fillna('6')

In [20]:
# Store the normalised month (a string holding the month number) on the articles frame.
articles['issuedMonth'] = issuedMonth

In [21]:
# Number of rows that still have no month. Series.count() skips NaN, so
# counting the null-selected rows always gave 0 regardless of the data;
# summing the null mask gives the real figure.
issuedMonth.isnull().sum()

0

In [22]:
# Number of rows with no issued year: selects the months of those rows and
# counts the non-null ones.
issuedMonth[issuedYear.isnull() == True].count()

20

## Check correctness of issued, submitted year and conference year

In [23]:
# Where issuedYear is later than submitted, overwrite it with submitted.
# Uses .loc instead of the deprecated .ix indexer; for a boolean row mask plus
# a column label the two select the same cells.
issued_after_submitted = articles.submitted - articles.issuedYear < 0
articles.loc[issued_after_submitted, 'issuedYear'] = articles.loc[issued_after_submitted, 'submitted']

In [24]:
# Sanity check: per-column count of rows where issuedYear is still later than
# submitted.
articles[articles.submitted - articles.issuedYear < 0].count()

articleId         0
issued            0
language          0
submitted         0
conference        0
conferenceData    0
journal           0
journalData       0
journalIssn       0
conferenceYear    0
issuedYear        0
issuedMonth       0
dtype: int64

In [25]:
# Check correctness of conference year
articles.conferenceYear[(articles.conferenceYear < 1970) | (articles.conferenceYear > 2020)]

Series([], Name: conferenceYear, dtype: float64)

In [26]:
# Count the articles whose conference year is greater than their `submitted` value.
articles.articleId[articles.conferenceYear.notnull() & (articles.conferenceYear > articles.submitted)].count()

0

## Removing mistyped journal names

In [27]:
# For each ISSN, pick the most frequent journalData value within that ISSN
# group (first entry of value_counts) and write it to every row with that ISSN.
# NOTE(review): rows whose journalIssn is missing have no entry in the mapping
# and so receive NaN here, dropping whatever journalData they had. Confirm
# that this is intended.
articles.journalData = articles.journalIssn.map(articles.groupby('journalIssn').journalData.agg(lambda x:x.value_counts().index[0]).to_dict())

In [28]:
# The reverse direction: for each journalData value, pick the most frequent
# ISSN within that group and write it to every row with that journalData.
# NOTE(review): rows whose journalData is missing receive NaN for journalIssn
# here. Confirm that this is intended.
articles.journalIssn = articles.journalData.map(articles.groupby('journalData').journalIssn.agg(lambda x:x.value_counts().index[0]).to_dict())

In [29]:
# Inspect the (ISSN, title) pairs and how many rows each one has.
articles.groupby('journalIssn').journalData.value_counts()

journalIssn  journalData                                                                                                       
0001-1541    AIChE journal                                                                                                          2
0001-2351    Transactions of the ASAE (USA)                                                                                        10
0001-3137    Abeille de France                                                                                                      1
0001-3943    Bulletin of the Institute of Zoology, Academia Sinica                                                                  1
0001-4087    Bulletin de l'Academie polonaise des sciences. Serie des sciences biologiques                                          1
0001-5296    Acta biologica Cracoviensia. Series Botanica                                                                           1
0001-530X    Acta biologica Cracoviensia. Serie: Zoologie           

In [30]:
# Inspect the (title, ISSN) pairs and how many rows each one has.
articles.groupby('journalData').journalIssn.value_counts()

journalData                                                                                                journalIssn
ACS symposium series (USA)                                                                                 0097-6156       4
AGRIS on-line Papers in Economics and Informatics                                                          1804-1930       2
AIChE journal                                                                                              0001-1541       2
Abeille de France                                                                                          0001-3137       1
Acta Agraria et Silvestria. Series Agraria                                                                 0065-0919       1
Acta Agronomica Sinica                                                                                     0496-3490       2
Acta Amazonica                                                                                             0044-5967       4
Acta B

## Languages correctness

In [31]:
# Row count per language code. Codes are compared as exact strings, so
# variants that differ only in case are listed separately.
articles.groupby('language').language.value_counts()

language  language
Eng       Eng            1
FRE       FRE            1
afr       afr            5
ara       ara           10
bem       bem            3
ben       ben            1
bul       bul           48
cat       cat            1
chi       chi           90
cze       cze           49
dan       dan           50
dut       dut           66
eng       eng         6629
esp       esp          310
est       est            2
fin       fin            3
fra       fra           18
fre       fre          470
geo       geo            3
ger       ger          556
gre       gre            1
gua       gua            1
heb       heb            3
hun       hun           34
ind       ind           23
ita       ita          191
jpn       jpn          174
kor       kor            6
lat       lat            1
lav       lav            7
mac       mac            3
mal       mal            1
map       map            1
mul       mul            1
nor       nor           16
per       per            3
phi      

### Set conference Ids

In [32]:
# Start with every conference ID unset (NaN).
conferenceDF["conferenceId"] = np.nan

In [33]:
def similarConference(a,b):
    """Decide whether two conference rows refer to the same conference.

    Both rows must carry an equal "conferenceDateYear", and the similarity
    ratio of their "conferenceData" texts must exceed 0.6. A missing (NaN)
    year compares unequal to everything, itself included, so such rows never
    match.
    """
    years_differ = a["conferenceDateYear"] != b["conferenceDateYear"]
    if years_differ:
        return False
    title_ratio = similar(a["conferenceData"], b["conferenceData"])
    return title_ratio > 0.6

def conferenceComput(cdf):
    """Assign a conference ID to every row of ``cdf`` that has conferenceData.

    Rows are visited in frame order. Each row is compared, via
    similarConference, with the rows that already carry an ID; on the first
    match it takes that row's ID, otherwise it receives the next unused ID
    (IDs start at 1). The "conferenceId" column is modified in place and is
    expected to be all-NaN on entry.

    Returns the number of rows that were merged into an existing ID.
    """
    next_id = 1
    diff = 0
    for idx, x in cdf[cdf.conferenceData.notnull()].iterrows():
        # Re-select the labelled rows on every pass so that IDs assigned
        # earlier in this loop are visible to later rows.
        for idy, y in cdf[cdf.conferenceId.notnull()].iterrows():
            if similarConference(x, y):
                # .loc replaces DataFrame.set_value, which is deprecated.
                cdf.loc[idx, "conferenceId"] = y["conferenceId"]
                diff += 1
                break
        else:
            # No labelled row matched (always the case for the first row):
            # open a new conference ID.
            cdf.loc[idx, "conferenceId"] = next_id
            next_id += 1
    return diff
# Label conferenceDF in place; the value shown is the number of rows that were
# merged into an already-assigned conference ID.
conferenceComput(conferenceDF)

7

In [34]:
# Copy the conference IDs onto the articles frame.
# NOTE(review): this column is spelled "ConferenceId" with a capital C, unlike
# the other camelCase columns such as conferenceYear. Confirm before renaming.
articles["ConferenceId"] = conferenceDF.conferenceId