# Data analysis and visualization

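Code for analysing the InCites journal data (open-access share, citations and authors per country) and producing the figures saved under `../outputs/`.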


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from matplotlib import rcParams
# Let matplotlib adjust the layout of every figure automatically.
rcParams.update({'figure.autolayout': True})

# df: one row per journal; cgab: one row per country (Group, GDP_pc, Population, ...).
df = pd.read_csv('../data/inCites/dataframe.csv', index_col=0)
cgab = pd.read_csv('../data/inCites/countries_groupAB.csv', index_col=0)

# Population in Millions.
# This must run before cgab_gdpord is derived: sort_values() returns a new
# frame, so converting cgab afterwards would leave the sorted copy (the one
# used for the per-population normalisation) in raw units.
cgab.Population = cgab.Population / 1000000

countries = list(cgab.index)
# Same country table, ordered by increasing GDP per capita.
cgab_gdpord = cgab.sort_values(by=['GDP_pc'])
countries_gdpord = list(cgab_gdpord.index)
#df_c = df[countries]

# Share of OA / non-OA items, and citation ratios, per journal.
df['PercOAItems'] = df.OAItems / df.TotItems
df['PercNotOAItems'] = df.NotOAItems / df.TotItems
df['CitR'] = df.CitPerItem_OA/ df.CitPerItem_NotOA
df['CitPerItem'] = df.TotCitations / df.TotItems
# Discipline-wide means, broadcast back onto every journal of that discipline.
df['AvgCit'] = df.groupby('Discipline').TotCitations.transform('mean')
df['AvgCitPerItem'] = df.groupby('Discipline').CitPerItem.transform('mean')

# Normalise citation counts and citations-per-item by the discipline mean;
# the new columns carry a '_norm' suffix.
df_citnorm = df[['TotCitations',  'OACitations', 'NotOACitations']].div(df.AvgCit, axis=0).add_suffix('_norm')
df_cpinorm = df[['CitPerItem_OA', 'CitPerItem_NotOA', 'CitPerItem']].div(df.AvgCitPerItem, axis=0).add_suffix('_norm')
df = pd.concat([df, df_citnorm, df_cpinorm], axis=1)
df['CitR_norm'] = df.CitPerItem_OA_norm/ df.CitPerItem_NotOA_norm

In [None]:
# Position-indexed copy of the GDP-ordered country table; idx holds the
# integer row positions of the group 'A' countries in that ordering.
test = cgab_gdpord.reset_index()
idx = test.loc[test['Group'] == 'A'].index.tolist()


In [None]:
# Show every column name of the journal table as a plain list.
df.columns.tolist()

In [None]:
# Does the N. of OA option trend with a higher citation rate OA/notOA ?
# Plotted: share of OA items (x) against citations per item divided by the
# discipline mean (y), coloured by discipline.

# Earlier explorations, kept for reference:
#sns.scatterplot(data=df, x="PercOAItems", y="CitR", hue="Discipline", palette="deep")
#avg_fullOA = df[df.NotOAItems < 1].groupby('Discipline').CitPerItem_OA.mean()
#avg_notOA = df[df.OAItems < 1].groupby('Discipline').CitPerItem_NotOA.mean()
#pd.concat([avg_fullOA, avg_notOA], axis=1).reset_index().rename(columns={'CitPerItem_OA': 'Full OA', 'CitPerItem_NotOA': 'No OA'})
scatter_opts = dict(x="PercOAItems", y="CitPerItem_norm", hue="Discipline", palette="deep")
sns.scatterplot(data=df, **scatter_opts)
t = plt.title(
    "Citations per article vs Journal's percentage of OA articles, normalized by average citations per item in the discipline"
)
plt.savefig('../outputs/cit-per-item_OA-perc_norm.png')

In [None]:
# Distribution of the OA share per discipline: a white box plot (with the
# mean marker enabled) overlaid with one black dot per journal.
# Median and quartiles, green triangle is the mean
shared = dict(data=df, x='PercOAItems', y='Discipline')
ax = sns.boxplot(color='w', width=0.5, linewidth=2.5, showmeans=True, **shared)
ax = sns.swarmplot(color='k', size=3, **shared)
t = plt.title('Percentage of OA items')
plt.savefig('../outputs/pct_OA_discipline.png')

In [None]:
# Two side-by-side panels sharing both axes: normalised citations per item
# of non-OA items (left) and OA items (right) against the journal's OA share.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8,5))
fig.suptitle('Citations vs % of OA items')
left_panel, right_panel = axes

sns.scatterplot(ax=left_panel, data=df, x='PercOAItems', y='CitPerItem_NotOA_norm', hue='Discipline')
left_panel.set_title('Not OA items')

# The legend is drawn on the left panel only.
sns.scatterplot(ax=right_panel, data=df, x='PercOAItems', y='CitPerItem_OA_norm', hue='Discipline', legend=False)
right_panel.set_title('OA items')

left_panel.set(yscale="log")
left_panel.set_ylabel('Citations per item')
plt.savefig('../outputs/cit_percOA.png')


In [None]:
# Total and OA item counts for the journals labelled 'physics'.
df.loc[df.Discipline == 'physics', ['TotItems', 'OAItems']]

In [None]:
# Normalised OA / non-OA citation ratio against the OA share, log-scaled y axis.
ratio_opts = dict(x="PercOAItems", y="CitR_norm", hue="Discipline", palette="deep")
g = sns.scatterplot(data=df, **ratio_opts)
g.set(yscale="log")

In [None]:
# Normalised citations per OA item against the journal's OA share.
sns.scatterplot(
    x="PercOAItems",
    y="CitPerItem_OA_norm",
    hue="Discipline",
    palette="deep",
    data=df,
)

In [None]:
# Journals whose OA items average more than 30 citations each.
df[df.CitPerItem_OA > 30]

In [None]:
# Raw citations per non-OA item against the journal's non-OA share.
sns.scatterplot(
    x="PercNotOAItems",
    y="CitPerItem_NotOA",
    hue="Discipline",
    palette="deep",
    data=df,
)

In [None]:
# Journals with fewer than one OA item.
df.loc[df['OAItems'] < 1]

In [None]:
# GDP per capita of every country (in increasing GDP order), coloured by Group.
wide_figure = {'figure.figsize': (20, 5)}
sns.set(rc=wide_figure)
sns.scatterplot(
    data=cgab_gdpord,
    x=cgab_gdpord.index,
    y='GDP_pc',
    hue='Group',
)
# Country names are long: rotate the tick labels so they stay legible.
plt.xticks(rotation=90)
t = plt.title("Countries' GDP per capite and Group classification for APC waivers")
plt.savefig('../outputs/gdp.png')

In [None]:
# Build df_c: rows are countries (GDP order), columns are a
# (Discipline, Journal) MultiIndex, values are the per-country columns of df.
#df_c = df[countries+['Discipline']].set_index('Discipline', append=True)
per_country = df[countries_gdpord + ['Discipline']]
by_journal = per_country.set_index('Discipline', append=True)
by_journal.index.names = ['Journal', 'Discipline']
df_c = by_journal.reorder_levels(['Discipline', 'Journal']).T

#g = sns.scatterplot(data=df_c)
#g.set_xticklabels(g.get_xticklabels(), rotation=30)
sns.set(rc={'figure.figsize': (20, 5)})
# Wide-form input: every column is drawn against the row index (countries).
sns.scatterplot(data=df_c)
plt.xticks(rotation=90)
t = plt.title('Tot. N of authors per Journal (2019), per Country')
plt.savefig('../outputs/n_auth_journal_country.png')

# Tot. N of authors per discipline
#df_c.groupby('Discipline', axis=1).sum()
# Country percentage of authors per journal
#df_c / df_c.sum()

In [None]:
# Normalized N of authors per population: each country row of df_c is
# divided by that country's entry in cgab_gdpord.Population.
# NOTE(review): the title states millions - confirm cgab_gdpord.Population
# is expressed in that unit.
population = cgab_gdpord['Population']
df_c_n = df_c.divide(population, axis='index')
sns.scatterplot(data=df_c_n)
plt.xticks(rotation=90)
t = plt.title('Tot. N of authors per Journal (2019), per Country, normalized per population (in M)')
plt.savefig('../outputs/n_auth_journal_country_popnorm.png')


In [None]:
# Seaborn wide-form convention: each column is plotted against the index,
# using both hue and style mapping (X == index; Y == columns).
#tdf = df_c / df_c.sum() #This divides by tot.N of authors!
# Divide every journal column by that journal's total number of items; the
# Series index is matched against column level 1 (Journal).
items_per_journal = df['TotItems']
tdf = df_c.div(items_per_journal, level=1)
sns.scatterplot(data=tdf)
plt.xticks(rotation=90)
t = plt.title('Percentage of authors from a Country wrt the total 2019 publications in a Journal')
plt.savefig('../outputs/pc_auth_journal_country.png')


In [None]:
#tdf_n = df_c_n / df_c.sum()
# Same per-journal division as for tdf, applied to the population-normalised
# table: each journal column is divided by that journal's TotItems.
items_per_journal = df['TotItems']
tdf_n = df_c_n.div(items_per_journal, level=1)
sns.scatterplot(data=tdf_n)
plt.xticks(rotation=90)
t = plt.title('Percentage of authors from a Country wrt the total 2019 publications in a Journal, per population (in M)')
plt.savefig('../outputs/pc_auth_journal_country_popnorm.png')


In [None]:
# Preview the first five country rows of the per-publication table.
tdf.head(n=5)

In [None]:
#tdf2 = df[countries+['Discipline']]
#tdf2.index.rename('Journal', inplace=True)
#tdf2.reset_index().melt(value_vars=[('Journal', 'Discipline')])

# Split the country rows of tdf_n by their Group label.
country_group = cgab_gdpord.Group
tdf_n_A = tdf_n.loc[country_group == 'A']
tdf_n_B = tdf_n.loc[country_group == 'B']
tdf_n_o = tdf_n.loc[country_group == 'Other']

# One box plot per group: countries on x, one box per discipline.
frames_by_label = {'A': tdf_n_A, 'B': tdf_n_B, 'Other': tdf_n_o}
for xl, xdf in frames_by_label.items():
    # Long form, keeping only column level 0 (Discipline) as the variable.
    long_form = pd.melt(xdf.reset_index(), id_vars=['index'], col_level=0)
    tdf2 = long_form.rename(columns={'index': 'Country'})
    sns.boxplot(data=tdf2, x='Country', y='value', hue='Discipline')
    plt.xticks(rotation=90)
    t = plt.title(xl)
    plt.savefig(f'../outputs/{xl}.png')
    # Empty the current figure so the next group starts from a blank canvas.
    plt.clf()

In [None]:
#tdf_n[cgab_gdpord.Group == 'A']


# 3 x 3 plot

Split countries into groups "A", "B" and "Other", and journals into "Full OA", "Not OA" and "Hybrid"; each of the nine combinations gets its own panel.


In [None]:
# Reshape the two country x (Discipline, Journal) tables to long form, one
# row per (Country, Discipline, Journal), and attach journal- and
# country-level columns.

# tdf_n == Percentage of authors from a Country wrt the total 2019 publications in a Journal, per population (in M)
tdf1 = (
    tdf_n.reset_index()
    .melt(id_vars=['index'])
    .rename(columns={'index': 'Country', 'value': 'PercAuthPopNorm'})
)
# df_c_n == Tot. N of authors per Journal (2019), per Country, normalized per population (in M)
tdf2 = (
    df_c_n.reset_index()
    .melt(id_vars=['index'])
    .rename(columns={'index': 'Country', 'value': 'NAuthPopNorm'})
)

# Copied by row label; this relies on both melts producing the same row order.
tdf2['PercAuthPopNorm'] = tdf1.PercAuthPopNorm

# Journal-level columns only: drop the per-country columns and Discipline
# (the melted frame already carries Discipline).
tmp = (
    df.drop(columns=countries + ['Discipline'])
    .reset_index()
    .rename(columns={'index': 'Journal'})
)
mydf = (
    tdf2.set_index('Journal')
    .join(tmp.set_index('Journal'))
    .merge(cgab.Group, left_on='Country', right_index=True)
    .reset_index()
)

In [None]:
# Check if tdf2['PercAuthPopNorm'] = tdf1.PercAuthPopNorm is legit:
# the two long tables must agree row by row on every identifying column.
c1, c2, c3 = (tdf1[key] == tdf2[key] for key in ('Country', 'Discipline', 'Journal'))
for matches in (c1, c2, c3):
    print(matches.value_counts())


In [None]:
def assign_OA_type(row):
    """Label a journal row as 'NotOA', 'FullOA' or 'Hybrid'.

    The label is written to ``row['OAType']``:

    * ``'NotOA'``  when ``row.OAItems < 1`` (checked first),
    * ``'FullOA'`` when ``row.NotOAItems < 1``,
    * ``'Hybrid'`` otherwise.

    The row is modified in place and returned, so the function can be
    passed to ``DataFrame.apply(..., axis='columns')``.
    """
    if row.OAItems < 1:
        oa_type = 'NotOA'
    elif row.NotOAItems < 1:
        oa_type = 'FullOA'
    else:
        oa_type = 'Hybrid'
    # Item assignment instead of ``row.OAType = ...``: attribute assignment
    # only updates the Series when the 'OAType' label already exists and
    # otherwise silently sets a plain Python attribute.
    row['OAType'] = oa_type
    return row

# Seed an 'OAType' column with the placeholder 'NA', then let
# assign_OA_type overwrite it row by row.
mydf = mydf.assign(OAType='NA').apply(assign_OA_type, axis=1)

In [None]:
# Disciplines in descending order of row count in mydf; used as a fixed hue
# order so that colours agree across panels.
dis = mydf.Discipline.value_counts().index.tolist()

# Two 3 x 3 grids (rows: country group, columns: journal OA type), one for
# author counts and one for author percentages.
grid_opts = dict(sharex=True, sharey='row', figsize=(15,15))
fig_n, axes_n = plt.subplots(3, 3, **grid_opts)
fig_n.suptitle('Number of Authors per Journal (2019) normalized per population (M)')
fig_p, axes_p = plt.subplots(3, 3, **grid_opts)
fig_p.suptitle('Percentage of Authors per Journal (2019) normalized per population (M)')
sns.set(rc={'figure.figsize':(5,5)})

for a1, g in enumerate(['A', 'B', 'Other']):
    for a2, o in enumerate(['NotOA', 'Hybrid', 'FullOA']):
        panel_title = f'{g} countries, {o} Journals'
        panel_rows = mydf[(mydf.Group==g) & (mydf.OAType==o)]
        # Same subset and title in both grids; only the y column differs.
        for grid, y_col in ((axes_n, 'NAuthPopNorm'), (axes_p, 'PercAuthPopNorm')):
            panel = grid[a1][a2]
            panel.set_title(panel_title)
            sns.scatterplot(ax=panel, data=panel_rows, x='CitPerItem_norm', y=y_col, hue='Discipline', hue_order=dis)

fig_n.savefig('../outputs/n_authors.png')
fig_p.savefig('../outputs/perc_authors.png')

In [None]:
# Display the array of Axes currently bound to axes_n.
axes_n

In [None]:
# One figure per country group: a column of subplots (one per discipline in
# `dis`) showing the population-normalised number of authors per country,
# coloured by journal OA type.
#fig_n.suptitle('Number of Authors per Journal (2019) normalized per population (M)')
plt.clf()
for g in ['A', 'B', 'Other']:
    group_rows = mydf[mydf.Group == g]
    # Follow the number of disciplines instead of a hard-coded 6 rows.
    # squeeze=False keeps the result an array even for a single row; at
    # least one row is requested so an empty `dis` still yields a figure.
    fig_n, axes_n = plt.subplots(max(len(dis), 1), 1, sharex=True, figsize=(15,20), squeeze=False)
    axes_n = axes_n[:, 0]
    fig_n.suptitle(f'Number of Authors per Journal (2019) normalized per population (M) in group "{g}" Countries')
    for panel, d in zip(axes_n, dis):
        panel.set_title(f'{d}')
        sns.scatterplot(ax=panel, data=group_rows[group_rows.Discipline == d], x='Country', y='NAuthPopNorm', hue='OAType', hue_order=['NotOA', 'Hybrid', 'FullOA'])
    # Rotate the tick labels once, after all panels are drawn; this acts on
    # the current axes exactly as the per-iteration call did.
    t = plt.xticks(rotation=90)
    fig_n.savefig(f'../outputs/n_authors_{g}.png')
    # Close (not just clear) the figure so it does not stay open in memory
    # once it has been written to disk.
    plt.close(fig_n)


In [None]:
#
#fig_n.suptitle('Number of Authors per Journal (2019) normalized per population (M)')
plt.clf()
for a1,g in enumerate(['A', 'B', 'Other']):
    fig_p, axes_p = plt.subplots(6, 1, sharex=True, figsize=(15,20))
    fig_p.suptitle(f'Percentage of Authors per Journal (2019) normalized per population (M) in group "{g}" Countries')
    for a2,d in enumerate(dis):
        axes_p[a2].set_title(f'{d}')
        sns.scatterplot(ax=axes_p[a2], data=mydf[(mydf.Group==g) & (mydf.Discipline==d)], x='Country', y='PercAuthPopNorm', hue='OAType', hue_order=['NotOA', 'Hybrid', 'FullOA'])
        t = plt.xticks(rotation=90)
    fig_p.savefig(f'../outputs/pc_authors_{g}.png')
    plt.clf()


In [None]:
# One stand-alone figure per (country group, discipline) pair: percentage of
# authors per country, coloured by journal OA type.
sns.set(rc={'figure.figsize':(10,5)})
oa_order = ['NotOA', 'Hybrid', 'FullOA']
for g in ['A', 'B', 'Other']:
    for d in dis:
        selection = mydf[(mydf.Group==g) & (mydf.Discipline==d)]
        sns.scatterplot(data=selection, x='Country', y='PercAuthPopNorm', hue='OAType', hue_order=oa_order)
        plt.xticks(rotation=90)
        t = plt.title(f'Percentage of Authors per Journal (2019) in {d}, normalized per population (M) from group "{g}" Countries')
        plt.savefig(f'../outputs/pc_authors_{g}_{d}.png')
        # Wipe the figure before drawing the next combination.
        plt.clf()


In [None]:
# Three stacked panels sharing both axes, one per journal OA type, showing
# the population-normalised number of authors per country by discipline.
fig_n, axes_n = plt.subplots(3, 1, sharex=True, sharey=True, figsize=(20,15))
for panel, o in zip(axes_n, ['NotOA', 'Hybrid', 'FullOA']):
    panel.set_title(f'{o} Journals')
    of_type = mydf[mydf.OAType==o]
    sns.scatterplot(ax=panel, data=of_type, x='Country', y='NAuthPopNorm', hue='Discipline')
    t = plt.xticks(rotation=90)

In [None]:
# All rows at once: normalised author count per country, coloured by OA type.
sns.scatterplot(
    x='Country',
    y='NAuthPopNorm',
    hue='OAType',
    data=mydf,
)
t = plt.xticks(rotation=90)


In [None]:

# Group 'A' countries: author percentage against the journal's OA share.
group_a = mydf.loc[mydf['Group'] == 'A']
sns.scatterplot(data=group_a, x='PercOAItems', y='PercAuthPopNorm', hue='Discipline')

In [None]:
# Group 'B' countries: author percentage against the journal's OA share.
group_b = mydf.loc[mydf['Group'] == 'B']
sns.scatterplot(data=group_b, x='PercOAItems', y='PercAuthPopNorm', hue='Discipline')

In [None]:
# 'Other' countries: author percentage against the journal's OA share.
group_other = mydf.loc[mydf['Group'] == 'Other']
sns.scatterplot(data=group_other, x='PercOAItems', y='PercAuthPopNorm', hue='Discipline')

In [None]:
# One sub-table per discipline: the cross-section of tdf's columns for each
# value of column level 0.
tdfs = [tdf.xs(d, level=0, axis=1) for d in tdf.columns.levels[0]]


#tdf2 = df[countries+['Discipline']]
#

In [None]:
# Line plot over the list of per-discipline tables.
# NOTE(review): tdfs is a list of DataFrames - confirm seaborn accepts this
# as wide-form input.
sns.relplot(kind="line", data=tdfs)
#tdfs[0]

In [None]:
# Column totals of df_c: the sum over all country rows for each
# (Discipline, Journal) column.
df_c.sum(axis=0)