# Set Up

In [1]:
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go


# Notebook-wide configuration: switch plotly to notebook rendering, then set
# the pandas display options used by the table previews.
# NOTE(review): connected=True presumably loads plotly.js from a CDN, so the
# figures need network access to render - confirm against the plotly docs.
init_notebook_mode(connected=True)

for display_option, setting in (
    ('display.max_columns', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(display_option, setting)

# Loading Data

In [2]:
# Locations of the two request logs, relative to the notebook's working directory.
DOWNLOADS_JSON = 'data/download_requests.json'
TOPICS_JSON = 'data/topic_requests.json'

# One DataFrame per log; every analysis cell reads from these two frames.
downloads_req_df = pd.read_json(DOWNLOADS_JSON)
topics_req_df = pd.read_json(TOPICS_JSON)

In [3]:
# Preview the first five rows of the topic-request log.
topics_req_df.head(5)

Unnamed: 0,city,country,exists,format,id,lat,lon,time,topic,user
0,,United Kingdom,1,html,1,51.4964,-0.1224,2018-07-20 12:53:30,computer science,1
1,,Netherlands,1,html,2,52.3824,4.8995,2018-07-20 13:41:17,computer science,2
10,,,1,html,11,,,2018-07-20 18:10:27,computer science,3
100,,United Kingdom,1,html,101,51.4964,-0.1224,2018-07-25 18:08:09,computer science,1
1000,Wellingborough,United Kingdom,1,html,1001,52.3027,-0.6945,2018-09-15 11:09:51,binocular,4


In [4]:
# Preview the first five rows of the download-request log.
# NOTE(review): this preview renders the `user` and `user_email` columns;
# consider redacting them and clearing the saved output before sharing.
downloads_req_df.head(5)

Unnamed: 0,city,country,format,host,id,lat,lon,time,user,user_email,version
0,,United Kingdom,nt,open-webservice-csvip.open.ac.uk,1,51.4964,-0.1224,2018-10-29 18:02:57,137.108.200.90,angelo.salatino@open.ac.uk,3.0
1,,United Kingdom,owl,open-webservice-csvip.open.ac.uk,2,51.4964,-0.1224,2018-10-29 18:03:00,137.108.200.90,angelo.salatino@open.ac.uk,3.0
10,Ho Chi Minh City,Vietnam,csv,unknown,11,10.8142,106.6438,2018-11-06 17:29:40,27.74.254.26,,3.0
100,Leeds,United Kingdom,csv,cpc95608-seac27-2-0-cust957.7-2.cable.virginm.net,101,53.8,-1.5833,2019-01-16 18:27:43,80.2.63.190,,3.0
101,,United Kingdom,nt,ad96e3e4a.dsl.de.colt.net,102,51.4964,-0.1224,2019-01-18 08:35:49,62.96.11.74,,3.0


# Analysis of CSO downloads

In [5]:
# Donut chart: number of distinct download ids per `version` value.
versions = downloads_req_df.groupby('version')[['id']].nunique()

pie = go.Pie(
    labels=versions.index,
    values=versions['id'],
    name='Version',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)

# `donut` is the figure layout; only the title is customised.
donut = go.Layout(title='CSO downloads version fragmentation')
fig = go.Figure(data=[pie], layout=donut)
iplot(fig)

In [6]:
# Donut chart: number of distinct download ids per requested `format`.
formats = downloads_req_df.groupby('format')[['id']].nunique()

pie = go.Pie(
    labels=formats.index,
    values=formats['id'],
    name='Format',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)

# `donut` is the figure layout; only the title is customised.
donut = go.Layout(title='CSO downloads format fragmentation')
fig = go.Figure(data=[pie], layout=donut)
iplot(fig)

In [7]:
# Choropleth of download requests per country.
# Count the ids recorded for each country, then flatten the result into
# `country` / `id` columns so they can be handed to plotly.
download_data = (
    downloads_req_df
    .groupby('country')['id']
    .count()
    .reset_index()
)

downloads_map = [{
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': download_data['country'],
    'z': download_data['id'],
    'text': download_data['id'],
    'autocolorscale': True,
    'reversescale': False,
    # Thin grey outline around each country.
    'marker': {'line': {'color': 'rgb(180,180,180)', 'width': 0.5}},
    'colorbar': {'title': '#downloads'},
}]

layout = {
    'title': 'Download distribution',
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'equirectangular'},
    },
}

# The figure is passed as a plain dict and iplot is told to skip validation.
fig = {'data': downloads_map, 'layout': layout}
iplot(fig, validate=False)

In [8]:
# Ten countries with the highest download counts, largest first.
(
    download_data
    .sort_values('id', ascending=False)
    .head(10)
)

Unnamed: 0,country,id
44,United States,70
43,United Kingdom,64
21,Italy,33
17,India,30
14,Germany,26
13,France,21
10,China,18
9,Canada,15
36,Spain,12
7,Brazil,11


# Analysis of CSO topic requests

In [9]:
# Donut chart: number of distinct topic-request ids per requested `format`.
# Note that this rebinds `formats`, previously computed from the download log.
formats = topics_req_df.groupby('format')[['id']].nunique()

pie = go.Pie(
    labels=formats.index,
    values=formats['id'],
    name='Format',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)

# `donut` is the figure layout; only the title is customised.
donut = go.Layout(title='CSO topic requests format fragmentation')
fig = go.Figure(data=[pie], layout=donut)
iplot(fig)

In [10]:
# Donut chart: topic requests split by the `exists` flag, i.e. requests for a
# topic that exists (flag 1) versus requests for one that does not (flag 0).
hits = topics_req_df.groupby('exists')[['id']].nunique()

# The group keys are the raw 0/1 flags, which are meaningless as slice labels.
# Show readable names instead; any unexpected flag value falls back to its
# plain string form so that no slice is silently mislabelled.
hit_names = {1: 'Hit (topic exists)', 0: 'Miss (topic not found)'}
hit_labels = [hit_names.get(flag, str(flag)) for flag in hits.index]

pie = go.Pie(values=hits['id'],
             labels=hit_labels,
             textposition='inside',
             # Neutral trace name: it is shown in the hover text of both
             # slices, so 'Hit' would be misleading on the miss slice.
             name='Outcome',
             hoverinfo='label+value+name',
             hole=.4)

# `donut` is the figure layout; only the title is customised.
donut = go.Layout(title='CSO topics requests hits')
fig = go.Figure(data=[pie], layout=donut)
iplot(fig)

In [11]:
# Map combining two layers:
#   * a choropleth of the number of distinct users per country, and
#   * one bubble per (lat, lon) pair sized by the number of requests there.
requests_data = (
    topics_req_df
    .groupby(['lat', 'lon'])['id']
    .count()
    .reset_index()
)
users_data = (
    topics_req_df
    .groupby('country')['user']
    .nunique()
    .reset_index()
)

user_map = {
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': users_data['country'],
    'z': users_data['user'],
    'text': users_data['user'],
    'autocolorscale': True,
    'reversescale': False,
    # Thin grey outline around each country.
    'marker': {'line': {'color': 'rgb(180,180,180)', 'width': 0.5}},
    'colorbar': {'title': '#unique users'},
}

bubbles = {
    'type': 'scattergeo',
    'lon': requests_data['lon'],
    'lat': requests_data['lat'],
    'text': requests_data['id'],
    'marker': {
        # Request count scaled down by 10 and interpreted as marker area.
        'size': requests_data['id'] / 10,
        'line': {'width': 0.5, 'color': 'rgb(40,40,40)'},
        'sizemode': 'area',
    },
}

layout = {
    'title': 'Unique user and topic requests',
    'showlegend': False,
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'equirectangular'},
    },
}

# The figure is passed as a plain dict and iplot is told to skip validation.
fig = {'data': [user_map, bubbles], 'layout': layout}
iplot(fig, validate=False)

In [12]:
# Bar chart of the 50 most requested topics among requests with exists == 1.
bars_data = (
    topics_req_df.loc[topics_req_df['exists'] == 1]
    .groupby('topic')[['id']]
    .count()
    .sort_values(by='id', ascending=False)
    .reset_index()
    .head(50)
)

trace0 = go.Bar(x=bars_data['topic'], y=bars_data['id'])

# Slanted, small tick labels keep 50 topic names legible; request counts are
# drawn on a logarithmic axis.
topic_axis = dict(tickangle=-45, automargin=True, tickfont=dict(size=9))
count_axis = dict(title='# requests', type='log')
layout = go.Layout(title='Top-50 topics (hits)',
                   xaxis=topic_axis,
                   yaxis=count_axis)

fig = go.Figure(data=[trace0], layout=layout)
iplot(fig)

In [13]:
# Bar chart of the 50 most requested topics that do not exist (exists == 0).
# Bracket access is used for the `exists` column, matching the filter in the
# existing-topics chart and avoiding any clash with DataFrame attributes.
bars_data = (
    topics_req_df[topics_req_df['exists'] == 0]
    .groupby('topic')[['id']]
    .count()
    .sort_values(by='id', ascending=False)
    .reset_index()
)
bars_data = bars_data[0:50]

trace0 = go.Bar(
    x = bars_data['topic'],
    y = bars_data['id'],
    marker = dict(color='#cf0a5f'))

# These requests are for topics that were not found, so the title says
# "misses"; elsewhere in the notebook "hit" denotes exists == 1.
layout = go.Layout(title='Top-50 requested but non existing topics (misses)',
                   # Slanted, small tick labels keep 50 topic names legible.
                   xaxis=dict(tickangle=-45,
                             automargin=True,
                             tickfont=dict(size=9)),
                   # Request counts are drawn on a logarithmic axis.
                   yaxis=dict(title='# requests',
                              type='log'))

fig = go.Figure(data=[trace0], layout=layout)
iplot(fig)