# Data analysis and visualization

Code for analysis / visualization


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from matplotlib import rcParams
# Ask matplotlib to fit every figure's layout automatically.
rcParams['figure.autolayout'] = True

# Load the two input tables; each uses its first CSV column as the index.
df = pd.read_csv('../data/inCites/dataframe.csv', index_col=0)
cgab = pd.read_csv('../data/inCites/countries_groupAB.csv', index_col=0)

# The index labels of the country table are used later to select columns of df.
countries = list(cgab.index)

# Shares of OA / non-OA items (plain fractions of TotItems, despite the "Perc" prefix).
tot_items = df['TotItems']
df['PercOAItems'] = df['OAItems'] / tot_items
df['PercNotOAItems'] = df['NotOAItems'] / tot_items

# Citations-per-item ratio (OA over not-OA) and overall citations per item.
df['CitR'] = df['CitPerItem_OA'] / df['CitPerItem_NotOA']
df['CitPerItem'] = df['TotCitations'] / tot_items

# Per-discipline means, broadcast back onto every row of that discipline.
by_discipline = df.groupby('Discipline')
avg_cit = by_discipline['TotCitations'].transform('mean')
avg_cit_per_item = by_discipline['CitPerItem'].transform('mean')
df['AvgCit'] = avg_cit
df['AvgCitPerItem'] = avg_cit_per_item

# Divide the citation columns by the discipline mean; results get a "_norm" suffix.
cit_cols = ['TotCitations', 'OACitations', 'NotOACitations']
cpi_cols = ['CitPerItem_OA', 'CitPerItem_NotOA', 'CitPerItem']
df_citnorm = df[cit_cols].div(df['AvgCit'], axis=0).add_suffix('_norm')
df_cpinorm = df[cpi_cols].div(df['AvgCitPerItem'], axis=0).add_suffix('_norm')
df = pd.concat([df, df_citnorm, df_cpinorm], axis=1)
df['CitR_norm'] = df['CitPerItem_OA_norm'] / df['CitPerItem_NotOA_norm']

# Population in millions
cgab['Population'] = cgab['Population'] / 1000000

In [None]:
# Show every column label currently present in df.
df.columns.tolist()

In [None]:
# Does the share of OA items trend with a higher citation rate (OA / not OA)?
#
# Earlier attempts, kept for reference:
#sns.scatterplot(data=df, x="PercOAItems", y="CitR", hue="Discipline", palette="deep")
#avg_fullOA = df[df.NotOAItems < 1].groupby('Discipline').CitPerItem_OA.mean()
#avg_notOA = df[df.OAItems < 1].groupby('Discipline').CitPerItem_NotOA.mean()
#pd.concat([avg_fullOA, avg_notOA], axis=1).reset_index().rename(columns={'CitPerItem_OA': 'Full OA', 'CitPerItem_NotOA': 'No OA'})

# One point per row of df, coloured by discipline.
sns.scatterplot(
    data=df,
    x="PercOAItems",
    y="CitPerItem_norm",
    hue="Discipline",
    palette="deep",
)
# Assigning to t keeps the returned Text object out of the cell output.
t = plt.title("Citations per article vs Journal's percentage of OA articles, normalized by average citations per item in the discipline")
plt.savefig('../outputs/cit-per-item_OA-perc_norm.png')

In [None]:
# Box plot of the OA share per discipline, with the individual rows
# overlaid as black swarm points on the same axes.
shared_args = dict(data=df, x='PercOAItems', y='Discipline')
ax = sns.boxplot(palette="Set3", **shared_args)
ax = sns.swarmplot(color='k', size=4, **shared_args)
t = plt.title('Percentage of OA items')
plt.savefig('../outputs/pct_OA_discipline.png')

In [None]:
# Total and OA item counts for the rows whose Discipline is 'physics'.
df.loc[df['Discipline'] == 'physics', ['TotItems', 'OAItems']]

In [None]:
# Normalized OA / not-OA citation ratio against the OA share, on a log y-axis.
g = sns.scatterplot(
    data=df,
    x="PercOAItems",
    y="CitR_norm",
    hue="Discipline",
    palette="deep",
)
g.set(yscale="log")

In [None]:
# Normalized citations per OA item against the OA share, coloured by discipline.
sns.scatterplot(
    data=df,
    x="PercOAItems",
    y="CitPerItem_OA_norm",
    hue="Discipline",
    palette="deep",
)

In [None]:
# Rows with more than 30 citations per OA item.
df[df.CitPerItem_OA.gt(30)]

In [None]:
# Citations per non-OA item against the non-OA share, coloured by discipline.
sns.scatterplot(
    data=df,
    x="PercNotOAItems",
    y="CitPerItem_NotOA",
    hue="Discipline",
    palette="deep",
)

In [None]:
# Rows with fewer than one OA item.
df.loc[df['OAItems'] < 1]

In [None]:
# Keep the country columns plus Discipline, and append Discipline to the
# row index as a second level.
country_cols = countries + ['Discipline']
df_c = df[country_cols].set_index('Discipline', append=True)
df_c.index.names = ['Journal', 'Discipline']

# Put Discipline first, then transpose: rows become the countries and the
# columns become a (Discipline, Journal) MultiIndex.
df_c = df_c.reorder_levels(['Discipline', 'Journal']).T

#g = sns.scatterplot(data=df_c)
#g.set_xticklabels(g.get_xticklabels(), rotation=30)

# Wide figure so the rotated x tick labels fit.
sns.set(rc={'figure.figsize': (20, 5)})
sns.scatterplot(data=df_c)
t = plt.xticks(rotation=90)
t = plt.title('Tot. N of authors per Journal (2019), per Country')
plt.savefig('../outputs/n_auth_journal_country.png')

# Tot. N of authors per discipline
#df_c.groupby('Discipline', axis=1).sum()
# Country percentage of authors per journal
#df_c / df_c.sum()

In [None]:
# Number of authors normalized by population: each row of df_c is divided
# by the Population entry of cgab with the same index label.
population = cgab['Population']
df_c_n = df_c.div(population, axis='index')
sns.scatterplot(data=df_c_n)
t = plt.xticks(rotation=90)
t = plt.title('Tot. N of authors per Journal (2019), per Country, normalized per population (in M)')
plt.savefig('../outputs/n_auth_journal_country_popnorm.png')


In [None]:
# With wide-form data, seaborn plots each column against the index, using
# both hue and style mapping: X == index, Y == columns.
#
# Alternative that divides by the total number of authors instead:
#tdf = df_c / df_c.sum()

# Divide by the total number of publications: TotItems is matched against
# level 1 of df_c's columns.
tdf = df_c.div(df['TotItems'], level=1)
sns.scatterplot(data=tdf)
t = plt.xticks(rotation=90)
t = plt.title('Percentage of authors from a Country wrt the total 2019 publications in a Journal')
plt.savefig('../outputs/pc_auth_journal_country.png')


In [None]:
# Alternative that divides by the total number of authors instead:
#tdf_n = df_c_n / df_c.sum()

# Same division by total publications as for tdf, applied to the
# population-normalized table.
tdf_n = df_c_n.div(df['TotItems'], level=1)
sns.scatterplot(data=tdf_n)
t = plt.xticks(rotation=90)
t = plt.title('Percentage of authors from a Country wrt the total 2019 publications in a Journal, per population (in M)')
plt.savefig('../outputs/pc_auth_journal_country_popnorm.png')


In [None]:
# Peek at the first five rows of tdf.
tdf.head(n=5)

In [None]:
#tdf2 = df[countries+['Discipline']]
#tdf2.index.rename('Journal', inplace=True)
#tdf2.reset_index().melt(value_vars=[('Journal', 'Discipline')])

# Split the rows of tdf_n by the GroupA / GroupB flags in cgab.
in_group_a = cgab['GroupA'] > 0
in_group_b = cgab['GroupB'] > 0
in_neither = (cgab['GroupA'] < 1) & (cgab['GroupB'] < 1)
tdf_n_A = tdf_n.loc[in_group_a]
tdf_n_B = tdf_n.loc[in_group_b]
tdf_n_o = tdf_n.loc[in_neither]

# FIXME: group A / B membership is not correct yet - check e.g. Gaza
# (name mismatch to fix).

# One box plot per group, saved as ../outputs/<label>.png.
frames_by_label = {'A': tdf_n_A, 'B': tdf_n_B, 'other': tdf_n_o}
for xl, xdf in frames_by_label.items():
    # Long form: one row per (country, column) pair, keyed on column level 0.
    tdf2 = xdf.reset_index().melt(col_level=0, id_vars=['index'])
    tdf2 = tdf2.rename(columns={'index': 'Country'})
    sns.boxplot(x='Country', y='value', hue='Discipline', data=tdf2)
    t = plt.xticks(rotation=90)
    t = plt.title(xl)
    plt.savefig(f'../outputs/{xl}.png')
    # Clear the current figure so the next group starts from an empty one.
    plt.clf()

# TODO
Complete the melt with OA information for the correlation plots.

In [None]:
# Long-form version of tdf_n, keeping the former row index as the id column.
# (Not used: value_vars=['Journal'].)
tdf_n_flat = tdf_n.reset_index()
tdf2 = tdf_n_flat.melt(id_vars=['index'])

In [None]:
# Peek at the first five rows of the melted table.
tdf2.head(n=5)

In [None]:
# Inspect the column labels of tdf (built from df_c, whose columns are a
# (Discipline, Journal) MultiIndex).
tdf.columns

In [None]:
# One sub-table per value of column level 0: the cross-section drops that
# level and keeps the remaining column level.
tdfs = [
    tdf.xs(d, level=0, axis=1)
    for d in tdf.columns.levels[0]
]


#tdf2 = df[countries+['Discipline']]
#

In [None]:
# Line plot of the per-discipline tables.
# NOTE(review): tdfs is a list of DataFrames - confirm seaborn accepts this
# as wide-form input; plotting a single entry (e.g. tdfs[0]) may be needed.
sns.relplot(
    data=tdfs,
    kind="line",
)
#tdfs[0]

In [None]:
# Column totals of df_c, i.e. the sum over its rows for each column.
df_c.sum(axis=0)