# Day 2 - Familiarise with scholarly data

## Import the libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

## Load conference data

The datasets used here come from the paper https://content.iospress.com/articles/data-science/ds190015 and cover three conferences: the International Semantic Web Conference (ISWC), the European Conference on Semantic Web (ESWC), and the International Conference on Theory and Practice of Digital Libraries (TPDL).

These have been extracted from Springer Nature Scigraph (apparently the website does not work anymore, https://www.springernature.com/gp/researchers/scigraph).

In SciGraph they were assigned conference series identifiers from DBLP (i.e., semweb, esws and ercimdl, respectively), so I extracted them from a bigger dataset thanks to these keywords.

Let's load them directly.


In [None]:
def _load_conference(csv_path):
    """Read one conference CSV with every column as a string, then make `author_order` numeric."""
    frame = pd.read_csv(csv_path, dtype=object)
    # `dtype=object` keeps identifiers and dates verbatim; only the author position needs to be a number.
    frame['author_order'] = pd.to_numeric(frame['author_order'])
    return frame


iswc = _load_conference('../data/raw/iswc_enhanced.csv')
eswc = _load_conference('../data/raw/eswc_enhanced.csv')
tpdl = _load_conference('../data/raw/tpdl_enhanced.csv')


Let me fix the data for you first

In [None]:
# Proceedings volume (book DOI) -> (city, country) to write into the ISWC rows of that volume.
venue_fixes = {
    '10.1007/978-3-319-25010-6': ('Bethlehem', 'United States'),
    '10.1007/978-3-319-25007-6': ('Bethlehem', 'United States'),
    '10.1007/978-3-540-76298-0': ('Busan', 'South Korea'),
}

for book_doi, (city, country) in venue_fixes.items():
    in_volume = iswc['book_doi'] == book_doi
    iswc.loc[in_volume, 'conf_city'] = city
    iswc.loc[in_volume, 'conf_country'] = country

# Use a single spelling for the United States across all rows.
iswc.loc[iswc['conf_country'] == 'USA', 'conf_country'] = 'United States'

All the dataframes follow the same structure. Check the columns.

Check what a generic record looks like.

OK. Let's see ISWC first

In [None]:
# Summarise every column of the ISWC frame, not only the numeric ones (include='all').
iswc.describe(include='all')

Ok, let's double check this conference series.

In [None]:
# List the distinct conference acronyms present in the ISWC frame.
iswc['conf_acronym'].unique()

Ah... it looks like there are different acronyms here. Check if they are indeed the conference you need.

In [None]:
# Conference name of the first row carrying the 'SWSWPC' acronym (empty array if there is none).
iswc.loc[iswc['conf_acronym'] == 'SWSWPC', 'conf_name'].head(1).values

Check the other as well.

Ok, we have intruders here. Let's drop them.

In [None]:
# Keep only the rows whose acronym is exactly 'ISWC'; rows with any other (or missing) acronym are discarded.
iswc = iswc[iswc['conf_acronym'] == 'ISWC'].copy()

In [None]:
# Summarise all columns of the ISWC frame again to see the effect of the filtering.
iswc.describe(include='all')

Onto ESWC now

In [None]:
# List the distinct conference acronyms present in the ESWC frame.
eswc['conf_acronym'].unique()

Check the other acronym just to be sure

Ah! That's ok! This was the name of the conference before it was a conference.

In [None]:
# Summarise every column of the ESWC frame, not only the numeric ones (include='all').
eswc.describe(include='all')

Finally, let's check TPDL

In [None]:
# List the distinct conference acronyms present in the TPDL frame.
tpdl['conf_acronym'].unique()

Check ECDL 

Let me fix this date for you. I know it was wrong.

In [None]:
# Re-date every TPDL row stamped '2014-01-01' to '2013-01-01' (years are stored as strings).
misdated = tpdl['year'].eq('2014-01-01')
tpdl.loc[misdated, 'year'] = '2013-01-01'

In [None]:
# Summarise every column of the TPDL frame, not only the numeric ones (include='all').
tpdl.describe(include='all')

## Analysis

For the sake of simplicity, we can create a variable `df` and assign one of our dataframes (`iswc`, `eswc`, `tpdl`) to it.

Pick one for the moment. Later, you can come up here, switch to another dataframe and run again the cells below.

In [None]:
# `df` is another name for the chosen dataframe (no copy is made); assign `eswc` or `tpdl` here to analyse another conference.
df = iswc

In [None]:
# First authors: rows whose position in the author list is 1.
df_first = df[df['author_order'] == 1]

# Last authors: rows whose position equals the highest position recorded for the same paper.
# The aggregation is named with the string 'max' because recent pandas deprecates passing the
# Python builtin `max` to groupby transform.
# Note that the sole author of a single-author paper is both first and last.
df_last_authors_idx = df.groupby('paper_doi')['author_order'].transform('max') == df['author_order']
df_last = df[df_last_authors_idx]

# Middle authors: everyone who is neither last nor first.
df_middle = df[~df_last_authors_idx & (df['author_order'] != 1)]

In [None]:
# Show every row of the selected dataframe that belongs to this one paper DOI.
df.loc[df['paper_doi'] == '10.1007/11926078_12']

In [None]:
# Distinct values of the `book_confSubtitle` column in the selected dataframe.
df.book_confSubtitle.unique()

**Task:** which countries has the conference visited?

**Task:** any country multiple times? Can you do a bar plot?

**Task:** which cities?

**Task:** Find the top-10 researchers by number of papers published

**Task:** Find the top-10 researchers by number of papers published (in last position)

**Task:** Find the top-10 researchers by number of papers published (in first position)

Do you see any notable inversion?

**Task:** Find and plot the total number of contributions per year

**Task:** Find and plot the same just with grid.id attached (i.e., with an organisation explicitly attached)

**Task:** Find and plot the same just without grid.id attached

For clarity, these can go in the same plot.

In [None]:
fig, ax = plt.subplots()

# `year` is stored as a string, so pandas draws a line plot against the positions 0..n-1 of
# whatever index it is given. Plotting the three groupby results separately would therefore
# misalign the lines whenever one subset lacks a year that another has. Building one frame
# aligns all three counts on the same set of years first.
has_grid_id = df['org_gridId'].notna()
contributions_per_year = pd.DataFrame({
    'total': df.groupby('year')['paper_doi'].count(),
    'w/ grid.id': df[has_grid_id].groupby('year')['paper_doi'].count(),
    'w/o grid.id': df[~has_grid_id].groupby('year')['paper_doi'].count(),
}).fillna(0)  # a year absent from a subset means zero contributions in that subset

contributions_per_year.plot.line(ax=ax)
ax.set(title='Contributions per year', xlabel='year', ylabel='contributions');

In plotly, to have an interactive plot, it would be like this.

In [None]:
def _trend_trace(counts, label, symbol):
    """Build a lines+markers trace from a per-year count Series (index on x, values on y)."""
    return go.Scatter(x=counts.index,
                      y=counts.values,
                      name=label,
                      marker=dict(symbol=symbol, size=8),
                      mode='lines+markers')


# Row counts per year: overall, and split by whether an organisation grid.id is present.
has_grid_id = df['org_gridId'].notna()
contributions = df.groupby('year')['paper_doi'].count()
contributions_with_gridId = df[has_grid_id].groupby('year')['paper_doi'].count()
contributions_without_gridId = df[~has_grid_id].groupby('year')['paper_doi'].count()

trace1 = _trend_trace(contributions, 'contributions', 'circle')
trace2 = _trend_trace(contributions_with_gridId, 'contributions (w/ gridID)', 'cross')
trace3 = _trend_trace(contributions_without_gridId, 'contributions (w/o gridID)', 'triangle-up')

layout = go.Layout(title='Trends of contributions',
                   legend=dict(orientation='h'))

fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
plotly.offline.iplot(fig)

**Task:** Find and plot the trend of PIDs (i.e., DOIs and grid.ids) per year

**Task:** Find and plot the distribution of papers by country

**Task:** Find and plot the number of papers by country (overall, first authors, last authors). 

We can also plot as a pie chart

In [None]:
def _papers_per_country(frame):
    """Count distinct paper DOIs per organisation country code, as a two-column frame."""
    return frame.groupby('org_countrycode')['paper_doi'].nunique().reset_index()


def _donut(counts, x_span, y_span):
    """Build one donut chart from a per-country count frame, placed in the given figure region."""
    return go.Pie(values=counts['paper_doi'],
                  labels=counts['org_countrycode'],
                  textposition='inside',
                  domain=dict(x=x_span, y=y_span),
                  name='DOIs',
                  hoverinfo='label+value+name',
                  hole=.4)


def _caption(text, x, y):
    """Build a fixed-size text annotation without an arrow at the given position."""
    return dict(font=dict(size=15),
                showarrow=False,
                text=text,
                x=x, y=y)


dois = _papers_per_country(df)
dois_first = _papers_per_country(df_first)
dois_last = _papers_per_country(df_last)

# One large donut on top (all authors), two smaller ones below (first / last authors).
pie_1 = _donut(dois, [0, 1], [.4, 1])
pie_2 = _donut(dois_first, [0, .5], [0, .6])
pie_3 = _donut(dois_last, [.5, 1], [0, .6])

tris_donut = go.Layout(title='DOI breakdown by country',
                       annotations=[_caption('All authors', .5, 1.07),
                                    _caption('1st authors', .18, -0.07),
                                    _caption('Last authors', .83, -0.07)])

fig = go.Figure(data=[pie_1, pie_2, pie_3], layout=tris_donut)
plotly.offline.iplot(fig)

**Task:** Can you do the same plots above aggregating by organisations this time?

**Task:** Are there institutions that never appear in first or last position? Check year by year.

Hint: `df_first.groupby(['year']).agg({'org_gridId': set})` aggregates a set of organizations