# Data preparation

Run the following once! It just fetches additional data for geolocating IP addresses.

In [3]:
# !curl https://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz > ./data/geoip.dat.gz
# !gunzip ./data/geoip.dat.gz

In [1]:
import pandas as pd
import numpy as np
import socket
from dns import reversename, resolver

import pygeoip
import pycountry_convert as pycountry

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.io as pio

# Enable inline (offline) plotly rendering for this notebook.
init_notebook_mode(connected=True)

# Display options: no cap on the number of columns shown, cell text cut at 50 chars.
pd.set_option('display.max_columns', None, 'display.max_colwidth', 50)

Let's define a function that geolocates an IP address (city, country, latitude and longitude)

In [2]:
# GeoIP database handle shared by every lookup below.
rawdata = pygeoip.GeoIP('./data/geoip.dat')

def ipquery(ip):
    """Look up the geolocation of ``ip`` in the ``rawdata`` GeoIP database.

    Returns a four-element list ``[city, country, lat, lon]``. When the
    database has no record for ``ip`` every element is ``None``.
    """
    record = rawdata.record_by_name(ip)
    if record is None:
        return [None] * 4
    return [
        record['city'],
        record['country_name'],
        record['latitude'],
        record['longitude'],
    ]

In [3]:
def reverse_dns(ip):
    """Return the first PTR record for ``ip`` as a string.

    Builds the reverse-lookup name for the address and queries it through the
    ``dns`` resolver; any resolver error propagates to the caller.
    """
    ptr_name = reversename.from_address(ip)
    answers = resolver.query(ptr_name, "PTR")
    return str(answers[0])

def reverse_dns2(ip):
    """Best-effort reverse DNS lookup based on ``socket.gethostbyaddr``.

    Parameters
    ----------
    ip : str
        IP address (or host name) to resolve.

    Returns
    -------
    str
        The primary host name reported for ``ip``, or ``'unknown'`` when the
        lookup fails for any reason.
    """
    try:
        return str(socket.gethostbyaddr(ip)[0])
    except Exception:
        # Deliberately broad: any lookup failure maps to 'unknown'. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit through, so a long
        # run over many addresses can still be interrupted.
        return 'unknown'

Loading dataframes

In [4]:
# The request records sit in the 'data' field of the row labelled 2 of the JSON file.
topics_export = pd.read_json('./data/log_topic_request.json')
lst = topics_export.loc[2, 'data']
topics_req_df = pd.DataFrame(lst)
topics_req_df.head(5)

Unnamed: 0,exists,format,id,time,topic,user
0,1,html,1,2018-07-20 12:53:30,computer science,angelo.salatino@open.ac.uk
1,1,html,2,2018-07-20 13:01:17,evaluation index system,5.45.207.68
2,1,html,3,2018-07-20 13:15:16,alpha phellandrene,66.249.79.98
3,1,html,4,2018-07-20 13:41:17,computer science,195.128.10.106
4,1,html,5,2018-07-20 13:44:44,microprocessor chips,195.128.10.106


In [5]:
# Number of logged topic requests.
len(topics_req_df)

92654

In [6]:
# The request records sit in the 'data' field of the row labelled 2 of the JSON file.
downloads_export = pd.read_json('./data/log_download_request.json')
lst = downloads_export.loc[2, 'data']
downloads_req_df = pd.DataFrame(lst)
downloads_req_df.head(5)

Unnamed: 0,format,id,time,user,version
0,nt,1,2018-10-29 18:02:57,angelo.salatino@open.ac.uk,3.0
1,owl,2,2018-10-29 18:03:00,angelo.salatino@open.ac.uk,3.0
2,csv,3,2018-10-29 18:03:01,angelo.salatino@open.ac.uk,3.0
3,csv,4,2018-10-29 18:03:26,137.108.66.104,3.0
4,owl,5,2018-10-29 18:03:29,137.108.66.104,3.0


In [7]:
# Number of logged download requests.
len(downloads_req_df)

115

Some entries in the _user_ column are email addresses rather than IP addresses, so we have to patch them: let's make each one point to a relevant IP address 
(these users are mainly us from KMi (**137.108.113.93**) or from Germany (Alex)).

In [8]:
# Distinct 'user' values that are email addresses (contain '@').
topics_req_df.loc[topics_req_df['user'].str.contains('@'), 'user'].unique()

array(['angelo.salatino@open.ac.uk', 'cso.skm3@gmail.com',
       'aliaksandr.birukou@springer.com', 'enrico.motta@open.ac.uk',
       'nina4ever27@gmail.com', 'rakeshgoad1999piplai@gmail.com',
       'danilodessi92@gmail.com', 'francesco.osborne@open.ac.uk',
       'martin.hlosta@open.ac.uk', 'matteo.cancellieri@open.ac.uk',
       'hakan.kiziloz@open.ac.uk', 'enrico.daga@open.ac.uk',
       'dasha.herrmannova@open.ac.uk', 'andrea.mannocci@open.ac.uk',
       'corrado.mencar@uniba.it'], dtype=object)

In [9]:
# Replace email-address users with a representative IP address so they can be
# geolocated like every other request. Each entry pairs a literal fragment of the
# email address with the IP that stands in for it; entries are applied in order.
UK_IP = '137.108.200.90'  # UK
EMAIL_FRAGMENT_TO_IP = [
    ('@open.ac.uk', UK_IP),
    ('aliaksandr.birukou', '129.206.13.27'),    # DE
    ('skm3', UK_IP),
    ('danilodessi92@gmail.com', UK_IP),
    ('nina4ever27', '178.253.95.9'),            # Syria
    ('rakeshgoad1999piplai', '64.236.16.116'),  # US
    ('uniba.it', '192.167.60.25'),              # IT
]

for email_fragment, replacement_ip in EMAIL_FRAGMENT_TO_IP:
    # regex=False: the fragments are plain text, so '.' must not act as a wildcard.
    # na=False: a missing user simply does not match instead of breaking the mask.
    is_match = topics_req_df.user.str.contains(email_fragment, regex=False, na=False)
    topics_req_df.loc[is_match, 'user'] = replacement_ip

topics_req_df.head()

Unnamed: 0,exists,format,id,time,topic,user
0,1,html,1,2018-07-20 12:53:30,computer science,137.108.200.90
1,1,html,2,2018-07-20 13:01:17,evaluation index system,5.45.207.68
2,1,html,3,2018-07-20 13:15:16,alpha phellandrene,66.249.79.98
3,1,html,4,2018-07-20 13:41:17,computer science,195.128.10.106
4,1,html,5,2018-07-20 13:44:44,microprocessor chips,195.128.10.106


In [10]:
# Distinct 'user' values that are email addresses (contain '@').
downloads_req_df.loc[downloads_req_df['user'].str.contains('@'), 'user'].unique()

array(['angelo.salatino@open.ac.uk'], dtype=object)

In [11]:
# Replace '@open.ac.uk' email users with the same IP used for them in the topic requests.
# regex=False: match the text literally, so '.' is not treated as a wildcard.
# na=False: a missing user simply does not match instead of breaking the mask.
is_open_ac_uk = downloads_req_df.user.str.contains('@open.ac.uk', regex=False, na=False)
downloads_req_df.loc[is_open_ac_uk, 'user'] = '137.108.200.90'
downloads_req_df.head()

Unnamed: 0,format,id,time,user,version
0,nt,1,2018-10-29 18:02:57,137.108.200.90,3.0
1,owl,2,2018-10-29 18:03:00,137.108.200.90,3.0
2,csv,3,2018-10-29 18:03:01,137.108.200.90,3.0
3,csv,4,2018-10-29 18:03:26,137.108.66.104,3.0
4,owl,5,2018-10-29 18:03:29,137.108.66.104,3.0


Done!   
Now, let's create 4 new columns with the IP geolocation

In [None]:
# One row per distinct user (IP address) seen in either log. The index is reset
# because concatenating the two frames would otherwise leave duplicate row labels.
ip_info = (
    pd.concat([topics_req_df[['user']], downloads_req_df[['user']]])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Reverse-DNS host name for each address ('unknown' when the lookup fails).
ip_info['host'] = ip_info['user'].map(reverse_dns2)

# Geolocation: ipquery returns [city, country, lat, lon] for each address.
geo_columns = ['city', 'country', 'lat', 'lon']
ip_info[geo_columns] = pd.DataFrame(
    [ipquery(address) for address in ip_info['user']],
    index=ip_info.index,
    columns=geo_columns,
)
ip_info.head()

In [None]:
# Left-join the per-user host/geolocation columns onto the topic requests.
topics_req_df = topics_req_df.merge(ip_info, on='user', how='left')
topics_req_df.head()

In [None]:
# Remove, one marker at a time, every request whose host name contains the marker.
host_markers = ('yandex', 'google', 'bot', 'crawl', 'yahoo', 'iparadigms')
for host_marker in host_markers:
    matching_rows = topics_req_df[topics_req_df.host.str.contains(host_marker)].index
    topics_req_df.drop(matching_rows, inplace=True)

This leaves us with

In [None]:
# Summary statistics for every column of the topic requests that remain after
# the host-based filtering above.
topics_req_df.describe(include='all')

In [None]:
# Left-join the per-user host/geolocation columns onto the download requests.
downloads_req_df = downloads_req_df.merge(ip_info, on='user', how='left')
downloads_req_df.head(5)

In [None]:
# Summary statistics for every column of the download requests.
downloads_req_df.describe(include='all')

# Analysis of CSO downloads

In [None]:
# Distinct download ids per CSO version.
versions = downloads_req_df.groupby('version')[['id']].nunique()

# Donut chart with one slice per version.
pie = go.Pie(
    labels=versions.index,
    values=versions['id'],
    name='Version',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)
donut = go.Layout(title='CSO downloads version fragmentation')
fig = go.Figure(data=[pie], layout=donut)

# Show the figure inline, then export it as a static image.
iplot(fig)
pio.write_image(fig, 'images/download_versions.png')

In [None]:
# Distinct download ids per file format.
formats = downloads_req_df.groupby('format')[['id']].nunique()

# Donut chart with one slice per format.
pie = go.Pie(
    labels=formats.index,
    values=formats['id'],
    name='Format',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)
donut = go.Layout(title='CSO downloads format fragmentation')
fig = go.Figure(data=[pie], layout=donut)

# Show the figure inline, then export it as a static image.
iplot(fig)
pio.write_image(fig, 'images/download_formats.png')

In [None]:
# Number of download requests per country.
download_data = downloads_req_df.groupby('country')['id'].count().reset_index()

# Choropleth trace: countries coloured by their download count.
downloads_map = [{
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': download_data['country'],
    'z': download_data['id'],
    'text': download_data['id'],
    'autocolorscale': True,
    'reversescale': False,
    'marker': {
        'line': {'color': 'rgb(180,180,180)', 'width': 0.5},
    },
    'colorbar': {'title': '#downloads'},
}]

layout = {
    'title': 'Download distribution',
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'equirectangular'},
    },
}

# Show the figure inline, then export it as a static image.
fig = {'data': downloads_map, 'layout': layout}
iplot(fig, validate=False)
pio.write_image(fig, 'images/downloads_distribution.png')

In [None]:
# The ten countries with the most download requests.
download_data.sort_values('id', ascending=False).head(10)

# Analysis of CSO topic requests

In [None]:
# Distinct topic-request ids per response format.
formats = topics_req_df.groupby('format')[['id']].nunique()

# Donut chart with one slice per format.
pie = go.Pie(
    labels=formats.index,
    values=formats['id'],
    name='Format',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)
donut = go.Layout(title='CSO topic requests format fragmentation')
fig = go.Figure(data=[pie], layout=donut)

# Show the figure inline, then export it as a static image.
iplot(fig)
pio.write_image(fig, 'images/requests_formats.png')

In [None]:
# Distinct topic-request ids per value of the 'exists' flag.
hits = topics_req_df.groupby('exists')[['id']].nunique()

# Donut chart with one slice per 'exists' value.
pie = go.Pie(
    labels=hits.index,
    values=hits['id'],
    name='Hit',
    hole=0.4,
    textposition='inside',
    hoverinfo='label+value+name',
)
donut = go.Layout(title='CSO topics requests hits')
fig = go.Figure(data=[pie], layout=donut)

# Show the figure inline, then export it as a static image.
iplot(fig)
pio.write_image(fig, 'images/requests_hits.png')

In [None]:
# Requests per (lat, lon) location and distinct users per country.
requests_data = topics_req_df.groupby(['lat', 'lon'])['id'].count().reset_index()
users_data = topics_req_df.groupby('country')['user'].nunique().reset_index()

# Choropleth trace: countries coloured by their number of distinct users.
user_map = {
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': users_data['country'],
    'z': users_data['user'],
    'text': users_data['user'],
    'autocolorscale': True,
    'reversescale': False,
    'marker': {
        'line': {'color': 'rgb(180,180,180)', 'width': 0.5},
    },
    'colorbar': {'title': '#unique users'},
}

# Bubble trace: one marker per location, with area given by its request count.
bubbles = {
    'type': 'scattergeo',
    'lon': requests_data['lon'],
    'lat': requests_data['lat'],
    'text': requests_data['id'],
    'marker': {
        'size': requests_data['id'],
        'line': {'width': 0.5, 'color': 'rgb(40,40,40)'},
        'sizemode': 'area',
    },
}

layout = {
    'title': 'Unique user and topic requests',
    'showlegend': False,
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'equirectangular'},
    },
}

# Show the figure inline, then export it as a static image.
fig = {'data': [user_map, bubbles], 'layout': layout}
iplot(fig, validate=False)
pio.write_image(fig, 'images/requests_distribution.png')

In [None]:
# The 50 most requested topics among requests flagged exists == '1'.
bars_data = (
    topics_req_df[topics_req_df.exists == '1']
    .groupby('topic')[['id']]
    .count()
    .sort_values(by='id', ascending=False)
    .reset_index()
    .head(50)
)

trace0 = go.Bar(x=bars_data['topic'], y=bars_data['id'])

# Log-scaled y axis; slanted, small tick labels so the 50 topic names fit.
layout = go.Layout(
    title='Top-50 topics (hits)',
    xaxis={'tickangle': -45, 'automargin': True, 'tickfont': {'size': 9}},
    yaxis={'title': '# requests', 'type': 'log'},
)

# Show the figure inline, then export it as a static image.
fig = go.Figure(data=[trace0], layout=layout)
iplot(fig)
pio.write_image(fig, 'images/topics_tip50.png')

In [None]:
# The 50 most requested topics among requests flagged exists == '0',
# i.e. the counterpart of the hits chart (which uses exists == '1').
bars_data = topics_req_df[topics_req_df.exists == '0'].groupby('topic')[['id']].count().sort_values(by='id', ascending=False).reset_index()
bars_data = bars_data[0:50]

trace0 = go.Bar(
    x = bars_data['topic'],
    y = bars_data['id'],
    marker = dict(color='#cf0a5f'))

# The title now says "misses": this chart plots exists == '0', but the title had
# been copied unchanged from the hits chart.
layout = go.Layout(title='Top-50 topics (misses)',
                   xaxis=dict(tickangle=-45,
                             automargin=True,
                             tickfont=dict(size=9)),
                   yaxis=dict(title='# requests',
                              type='log'))

fig = go.Figure(data=[trace0], layout=layout)
plotly.offline.iplot(fig)
