# Day 2 - Familiarise with scholarly data

## Import the libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode once, before any iplot() call below.
# Older plotly releases refuse to render offline figures until this has been
# called; on newer releases the call is harmless.
# connected=True loads plotly.js from the CDN instead of embedding it in the
# notebook file; switch to connected=False when working without network access.
init_notebook_mode(connected=True)

## Load conference data

In [None]:
def load_conference(acronym, data_dir='../data/raw'):
    """Load the enhanced CSV of one conference.

    Parameters
    ----------
    acronym : str
        Lower-case file prefix, e.g. 'iswc' for 'iswc_enhanced.csv'.
    data_dir : str
        Directory that holds the '<acronym>_enhanced.csv' files.

    Returns
    -------
    pandas.DataFrame
        Every column is read as a string (dtype=object), except
        'author_order', which is converted to a numeric column.
    """
    frame = pd.read_csv('{}/{}_enhanced.csv'.format(data_dir, acronym), dtype=object)
    # author_order is compared numerically later on (== 1, per-paper max),
    # so it must not stay a string.
    frame['author_order'] = pd.to_numeric(frame['author_order'])
    return frame


iswc = load_conference('iswc')
eswc = load_conference('eswc')
tpdl = load_conference('tpdl')


In [None]:
# Venue corrections for individual proceedings volumes: (book DOI, city, country).
venue_fixes = [
    ('10.1007/978-3-319-25010-6', 'Bethlehem', 'United States'),
    ('10.1007/978-3-319-25007-6', 'Bethlehem', 'United States'),
    ('10.1007/978-3-540-76298-0', 'Busan', 'South Korea'),
]

for book_doi, city, country in venue_fixes:
    # Select the rows of this volume once and patch both venue columns.
    in_volume = iswc['book_doi'] == book_doi
    iswc.loc[in_volume, 'conf_city'] = city
    iswc.loc[in_volume, 'conf_country'] = country

# Use a single spelling for the country so 'USA' and 'United States' are not two values.
iswc.loc[iswc['conf_country'] == 'USA', 'conf_country'] = 'United States'

In [None]:
# Distinct conference acronyms that occur in the ISWC frame.
iswc.conf_acronym.unique()

In [None]:
# Peek at the first rows filed under the 'SWSWPC' acronym.
iswc.loc[iswc['conf_acronym'] == 'SWSWPC'].head()

In [None]:
# Peek at the first rows filed under the 'SWDB' acronym.
iswc.loc[iswc['conf_acronym'] == 'SWDB'].head()

In [None]:
# Summary statistics for every column of the ISWC frame;
# include='all' also covers the non-numeric (string) columns.
iswc.describe(include='all')

In [None]:
# Distinct conference acronyms that occur in the ESWC frame.
eswc.conf_acronym.unique()

In [None]:
# Conference name recorded on the first row that carries the 'ESWS' acronym.
eswc.loc[eswc['conf_acronym'] == 'ESWS', 'conf_name'].head(1)

In [None]:
# Rows of the ESWC frame whose conference name is that of ISWC.
eswc.loc[eswc['conf_name'] == 'International Semantic Web Conference'].head()

In [None]:
# Summary statistics for every column of the ESWC frame;
# include='all' also covers the non-numeric (string) columns.
eswc.describe(include='all')

In [None]:
# Distinct conference acronyms that occur in the TPDL frame.
tpdl.conf_acronym.unique()

In [None]:
# Re-label every row dated '2014-01-01' as '2013-01-01' (years are stored as strings).
# NOTE(review): presumably these rows belong to the 2013 edition and were dated by
# publication year - confirm against the raw data.
tpdl.loc[tpdl['year'] == '2014-01-01', 'year'] = '2013-01-01'

In [None]:
# Summary statistics for every column of the TPDL frame;
# include='all' also covers the non-numeric (string) columns.
tpdl.describe(include='all')

## Analysis

For the sake of simplicity, we can create a variable `df` and assign to it one of our dataframes: `iswc`, `eswc` or `tpdl`.

Pick one for the moment. Later, you can come back here, switch to another dataframe and run the cells below again.

In [None]:
# Select the conference analysed by the cells below (iswc, eswc or tpdl).
# This binds a second name to the same DataFrame, it does not copy it:
# any in-place change made through `df` is also visible through `iswc`.
df = iswc

In [None]:
# First authors: rows whose position in the author list is 1.
df_first = df[df.author_order == 1]

# Last authors: rows whose position equals the highest position recorded for the
# same paper. The aggregation is named by string ('max'); passing the builtin
# `max` callable to transform() is deprecated in recent pandas releases.
df_last_authors_idx = df.groupby('paper_doi')['author_order'].transform('max') == df['author_order']
df_last = df[df_last_authors_idx]

# Middle authors: rows that are neither last nor first.
# The sole author of a single-author paper is both first and last, so that row
# appears in df_first and df_last but never in df_middle.
df_middle = df[~df_last_authors_idx & (df.author_order != 1)]

In [None]:
# All rows recorded for one example paper.
df.loc[df['paper_doi'] == '10.1007/11926078_12']

In [None]:
# Distinct proceedings subtitles present in the selected frame.
df['book_confSubtitle'].unique()

**Task:** Which countries has the conference visited?

In [None]:
# Distinct host countries present in the selected frame.
df['conf_country'].unique()

**Task:** Has any country been visited multiple times? Can you draw a bar plot?

In [None]:
# One row per distinct (country, year) pair, then count the years per country.
country_years = df[['conf_country', 'year']].drop_duplicates()
country_years.groupby('conf_country').count().plot.bar()

**Task:** which cities?

In [None]:
# Distinct host cities present in the selected frame.
df['conf_city'].unique()

**Task:** Find the top-10 researchers by number of papers published

In [None]:
# Top-10 authors by number of *distinct* papers.
# nunique() counts each paper DOI once per author, so an author who occupies
# several rows for the same paper is not over-counted; a plain count() would
# count rows instead. This matches the per-country paper counts further below,
# which also use nunique().
# NOTE(review): authors are identified by (family name, given name) only, so
# homonyms are merged - confirm this is acceptable.
(
    df.groupby(['author_familyName', 'author_givenName'])['paper_doi']
      .nunique()
      .sort_values(ascending=False)
      .reset_index()
      .head(10)
)

**Task:** Find the top-10 researchers by number of papers published (in last position)

**Task:** Find the top-10 researchers by number of papers published (in first position)

**Task:** are there notable inversions?

**Task:** Find and plot the total number of contributions per year

In [None]:
# Count the rows with a paper DOI per year and draw them as a line.
# Plotting the grouped Series directly (without reset_index and the explicit
# x/y column names) yields the same chart.
rows_per_year = df.groupby(['year'])['paper_doi'].count()
rows_per_year.reset_index().plot.line(x='year', y='paper_doi')

**Task:** Find and plot the same, but only for contributions with a grid.id attached (i.e., with an organisation explicitly attached)

In [None]:
# Keep only the rows that carry a grid.id, then count rows per year.
df.dropna(subset=['org_gridId']).groupby(['year'])['paper_doi'].count().plot.line(y='paper_doi')

**Task:** Find and plot the same, but only for contributions without a grid.id attached

In [None]:
# Keep only the rows that lack a grid.id, then count rows per year.
df.loc[df['org_gridId'].isna()].groupby(['year'])['paper_doi'].count().plot.line()

For clarity, these can go in the same plot.

In [None]:
fig, ax = plt.subplots()

# Split the rows once by presence of a grid.id; the two subsets partition df.
has_grid_id = df['org_gridId'].notna()
labelled_subsets = [
    ('total', df),
    ('w/ grid.id', df[has_grid_id]),
    ('w/o grid.id', df[~has_grid_id]),
]

# Draw one line per subset on the shared axes.
# NOTE(review): 'year' is read as a string column; if a subset lacks some years
# its points may not line up with the others on the x axis - confirm.
for label, rows in labelled_subsets:
    rows.groupby(['year'])['paper_doi'].count().reset_index().plot.line(x='year', y='paper_doi', ax=ax, label=label)

ax

In [None]:
def _trend_trace(counts, name, symbol):
    """Build a lines+markers trace from a Series of counts indexed by year."""
    return go.Scatter(x=counts.index,
                      y=counts.values,
                      name=name,
                      marker=dict(symbol=symbol, size=8),
                      mode='lines+markers')


# Rows per year: overall, and split by whether a grid.id is attached.
has_grid_id = df['org_gridId'].notna()
contributions = df.groupby(['year'])['paper_doi'].count()
contributions_with_gridId = df[has_grid_id].groupby(['year'])['paper_doi'].count()
contributions_without_gridId = df[~has_grid_id].groupby(['year'])['paper_doi'].count()

# One trace per series, each with its own marker symbol.
trace1 = _trend_trace(contributions, 'contributions', 'circle')
trace2 = _trend_trace(contributions_with_gridId, 'contributions (w/ gridID)', 'cross')
trace3 = _trend_trace(contributions_without_gridId, 'contributions (w/o gridID)', 'triangle-up')

# Horizontal legend so the three entries sit on one line.
layout = go.Layout(title='Trends of contributions',
                   legend=dict(orientation='h'))

fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
plotly.offline.iplot(fig)

**Task:** Find and plot the trend of PIDs (i.e., DOIs and grid.ids) per year

**Task:** Find and plot the distribution of papers by country per year

In [None]:
# Number of bars visible in the initial view.
TOP_N = 30

# Distinct papers per organisation country code, most productive first.
dois = df.groupby(['org_countrycode'])['paper_doi'].nunique().reset_index()
ranked = dois.sort_values(by=['paper_doi'], ascending=False)

data = [go.Bar(x=ranked['org_countrycode'], y=ranked['paper_doi'])]

# All countries are plotted; the x-axis range only limits the initial view to
# the first TOP_N bars (bar i is centred on position i, hence the half-unit margins).
layout = go.Layout(
    title='Paper distribution by country',
    xaxis=dict(tickangle=45,
               tickfont=dict(size=12),
               range=[-0.5, TOP_N - 0.5]),
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
def _papers_per_country(rows):
    """Count distinct paper DOIs per organisation country code."""
    return rows.groupby(['org_countrycode'])['paper_doi'].nunique().reset_index()


def _country_bar(counts, name):
    """Turn a per-country count frame into a named bar trace."""
    return go.Bar(y=counts['paper_doi'],
                  x=counts['org_countrycode'],
                  name=name)


# Same count for all authors, first authors only and last authors only.
dois = _papers_per_country(df)
dois_first = _papers_per_country(df_first)
dois_last = _papers_per_country(df_last)

bar_1 = _country_bar(dois, 'Overall')
bar_2 = _country_bar(dois_first, 'First')
bar_3 = _country_bar(dois_last, 'Last')

layout = go.Layout(title='DOI breakdown by country',
                   legend=dict(orientation='h'))

fig = go.Figure(data=[bar_1, bar_2, bar_3], layout=layout)
plotly.offline.iplot(fig)

We can also plot as a pie chart

In [None]:
def _country_donut(counts, x_span, y_span):
    """Build a donut trace of per-country paper counts placed in the given figure region."""
    return go.Pie(values=counts['paper_doi'],
                  labels=counts['org_countrycode'],
                  textposition='inside',
                  domain=dict(x=x_span, y=y_span),
                  name='DOIs',
                  hoverinfo='label+value+name',
                  hole=.4)


def _donut_caption(text, x, y):
    """Build an arrow-less annotation used as the caption of one donut."""
    return dict(font=dict(size=15),
                showarrow=False,
                text=text,
                x=x, y=y)


# Distinct papers per country: all authors, first authors only, last authors only.
dois, dois_first, dois_last = (
    rows.groupby(['org_countrycode'])['paper_doi'].nunique().reset_index()
    for rows in (df, df_first, df_last)
)

# Layout: one donut across the top, two side by side underneath.
pie_1 = _country_donut(dois, [0, 1], [.4, 1])
pie_2 = _country_donut(dois_first, [0, .5], [0, .6])
pie_3 = _country_donut(dois_last, [.5, 1], [0, .6])

tris_donut = go.Layout(title='DOI breakdown by country',
                       annotations=[_donut_caption('All authors', .5, 1.07),
                                    _donut_caption('1st authors', .18, -0.07),
                                    _donut_caption('Last authors', .83, -0.07)])

fig = go.Figure(data=[pie_1, pie_2, pie_3], layout=tris_donut)
plotly.offline.iplot(fig)

**Task:** Can you do the same plots aggregating by organisations?