# Merge and clean the InCites files


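This notebook loads the per-journal InCites CSV exports (open-access figures and contributions by country/region), merges them into combined data frames, and writes the results back to CSV.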


In [None]:
import os
import pandas as pd

def scan_incites_dir(maindir):
    """Walk *maindir* and index the export files found below it.

    Parameters
    ----------
    maindir : str
        Top-level directory; the files directly inside it are ignored.

    Returns
    -------
    tuple
        ``(disciplines, incites_files, journals)`` where ``disciplines`` is
        the list of sub-directory names directly under *maindir*,
        ``incites_files`` maps each visited directory name to the list of
        file names it contains, and ``journals`` maps the same directory
        name to the set of file-name prefixes found before the first ``#``.
    """
    disciplines = []
    incites_files = {}
    journals = {}
    for root, dirs, files in os.walk(maindir):
        if root == maindir:
            # The top level only tells us which sub-directories exist.
            disciplines = dirs
            continue
        # basename() copes with both '/' and '\\' separators, whereas
        # splitting on '/' returns the wrong name on Windows.
        d = os.path.basename(root)
        incites_files[d] = files
        journals[d] = {f.split('#')[0] for f in files}
    return disciplines, incites_files, journals


maindir = '../data/inCites'
disciplines, incites_files, journals = scan_incites_dir(maindir)

# File types to load; each one is the last '#'-separated part of a file name.
#csvf = ['Open-Access-OA-', 'Contributions-by-country-region', 'Contributions-by-organizations']
csvf = ['Open-Access-OA-', 'Contributions-by-country-region']
# The remaining parts of the file name: <journal>#<year>#<scie>#<file type>.csv
year = '2019'
scie = 'SCIE'
# Number of leading rows skipped when reading each CSV file.
nrows_toskip = 5

In [None]:
# Read every per-journal export and collect one single-row frame per journal,
# grouped by file type: dfs[<file type>] -> list of DataFrames.
dfs = {}
for ft in csvf:
    dfs[ft] = []
    for d, j in journals.items():
        # `j` is a set; iterate it sorted so the order of the collected
        # frames (and of the rows written out later) is the same on every run.
        for f in sorted(j):
            filepath = f'{maindir}/{d}/{f}#{year}#{scie}#{ft}.csv'
            df = pd.read_csv(filepath, skiprows=nrows_toskip).dropna()
            df['Journal'] = f
            if ft == 'Open-Access-OA-':
                # Index by the unnamed label column and discard the
                # percentage row, keeping only the count rows.
                df = df.set_index('Unnamed: 0').drop(['% Citable Open Access'])
                df['Items'] = df['Items'].astype('int')
                df['Citations*'] = df['Citations*'].astype('int')
                # Flatten the three rows of interest into a single record.
                newd = {
                    'TotItems': df.loc['Total Citable', 'Items'],
                    'TotCitations': df.loc['Total Citable', 'Citations*'],
                    'OAItems': df.loc['Gold OA Citable', 'Items'],
                    'OACitations': df.loc['Gold OA Citable', 'Citations*'],
                    'NotOAItems': df.loc['Subscription and Free to Read Citable', 'Items'],
                    'NotOACitations': df.loc['Subscription and Free to Read Citable', 'Citations*'],
                }
                df = pd.DataFrame(newd, index=[f])
                df['CitPerItem_OA'] = df.OACitations/df.OAItems
                df['CitPerItem_NotOA'] = df.NotOACitations/df.NotOAItems
            elif ft == 'Contributions-by-country-region':
                # One row per journal, one column per country holding 'Count'.
                df = df.drop(columns=['Rank']).pivot(index='Journal', values='Count', columns='Country')
                df['Discipline'] = d
            dfs[ft].append(df)

In [None]:
# Sanity check of the concat step: stack the first two frames of one file
# type and display the columns that exist in only one of them.

#ft='Open-Access-OA-'
ft = 'Contributions-by-country-region'

t1, t2 = dfs[ft][0], dfs[ft][1]
df = pd.concat([t1, t2], axis=0)

# Column names of each frame, as sets, so they can be subtracted.
i1, i2 = set(t1.columns), set(t2.columns)
diff1, diff2 = i1 - i2, i2 - i1

# Columns only in t1 first, then columns only in t2.
diff = [*diff1, *diff2]

df[diff]

In [None]:
# Join
# Country contributions: stack the per-journal frames, replace missing values
# with 0 and order the columns alphabetically.
dfc = pd.concat(dfs[csvf[1]], axis=0)
dfc = dfc.fillna(0)
dfc = dfc.sort_index(axis=1)
# pop() followed by re-assignment moves 'Discipline' to the last column.
dfc['Discipline'] = dfc.pop('Discipline')

# Open-access figures: stack the per-journal frames.
dfoa = pd.concat(dfs[csvf[0]], axis=0)

# Combine both tables on their index and add the row-wise sum of the numeric
# columns of `dfc` as 'TotAuthors'.
df1 = dfc.merge(dfoa, left_index=True, right_index=True)
df1['TotAuthors'] = dfc.sum(axis=1, numeric_only=True)

# Find groupA and groupB countries
# Every column of `dfc` except 'Discipline' is a country name.
countries = list(dfc.columns)
countries.remove('Discipline')
countries.sort()
# One row per country, with both group flags initialised to 0.
df = pd.DataFrame({'GroupA': 0, 'GroupB': 0}, index=countries)

# The lists hold one country name per line and contain non-ASCII characters,
# so the encoding is given explicitly rather than left to the platform
# default (assumes the files are stored as UTF-8 -- TODO confirm).
with open('../data/groupA_low_income.txt', 'r', encoding='utf-8') as ga:
    groupA = ga.read().splitlines()
with open('../data/groupB_lower_middle_income.txt', 'r', encoding='utf-8') as gb:
    groupB = gb.read().splitlines()

# Swap three group A names for alternative spellings (presumably the ones
# used in the country columns -- verify against the data). remove() raises
# ValueError if a name is missing from the list, which is kept on purpose so
# that a change in the source list does not go unnoticed.
for listed_name, replacement in (
    ('Côte d’Ivoire', 'Cote Ivoire'),
    ('Korea, Democratic People’s Republic of', 'North Korea'),
    ('Moldova (Republic of)', 'Moldova'),
):
    groupA.remove(listed_name)
    groupA.append(replacement)
groupA.sort()

def assign_group(row, group_a=None, group_b=None):
    """Flag a country row as belonging to group A or group B.

    Parameters
    ----------
    row : pandas.Series
        Row with 'GroupA' and 'GroupB' entries; ``row.name`` is the label
        that is looked up in the two groups.
    group_a, group_b : collection of str, optional
        Names belonging to each group. When omitted, the module-level
        ``groupA`` / ``groupB`` lists are used.

    Returns
    -------
    pandas.Series
        A copy of *row* with 'GroupA' set to 1 if the label is in group A,
        otherwise 'GroupB' set to 1 if it is in group B, otherwise unchanged.
    """
    if group_a is None:
        group_a = groupA
    if group_b is None:
        group_b = groupB
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    row = row.copy()
    if row.name in group_a:
        row['GroupA'] = 1
    elif row.name in group_b:
        row['GroupB'] = 1
    return row

# Flag every country row (one call of assign_group per row).
dfcg = df.apply(assign_group, axis=1)

# Write the merged table, its two source tables and the country flags to CSV.
for frame, target in (
    (df1, '../data/inCites/dataframe.csv'),
    (dfc, '../data/inCites/dataframe_only_countries.csv'),
    (dfoa, '../data/inCites/dataframe_only_openaccess.csv'),
    (dfcg, '../data/inCites/countries_groupAB.csv'),
):
    frame.to_csv(target)

In [None]:
#filepath = '../data/inCites/physics/PHYS-LETT-B#2019#SCIE#Contributions-by-country-region.csv'
#test = pd.read_csv(filepath, skiprows=nrows_toskip).dropna()
#t1 = dfc.loc['PHYS-LETT-B']
#t2 = dfoa.loc['PHYS-LETT-B']