Run the following once! It just fetches additional data for geolocating IP addresses.

In [1]:
# One-time setup: uncomment and run once, then comment out again.
# The first command saves the gzipped GeoLiteCity database as ./data/geoip.dat.gz,
# the second unpacks it next to the archive.
# !curl http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz > ./data/geoip.dat.gz
# !gunzip ./data/geoip.dat.gz

In [2]:
import pygeoip
import pandas as pd
import numpy as np

import pycountry_convert as pycountry

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.io as pio

# Enable inline plotly rendering for this notebook session.
# NOTE(review): connected=True is documented to load plotly.js from a CDN, so the
# figures presumably need network access to render - confirm if offline use matters.
init_notebook_mode(connected=True)

# Display every column of a DataFrame, but truncate cell text at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

Let's define a function for IP resolution

In [3]:
# Open the GeoIP database once; ipquery() reads this module-level handle on every lookup.
rawdata = pygeoip.GeoIP('./data/geoip.dat')

def ipquery(ip):
    """Geolocate one IP address (or host name) with the GeoIP database.

    Parameters
    ----------
    ip : str
        Value to look up; it is handed unchanged to ``rawdata.record_by_name``.

    Returns
    -------
    list
        ``[city, country, lat, lon]`` from the matching record, or
        ``[None, None, None, None]`` when there is nothing to report:

        * ``ip`` is ``None``, ``NaN`` or an empty string (a missing log field),
        * the lookup raises a socket error (e.g. a user name that is neither
          an IP address nor a resolvable host name),
        * the database has no record for the address.
    """
    import socket  # local import: only needed for the exception type below

    not_found = [None, None, None, None]

    # None, NaN (the only value that is unequal to itself) and '' carry no address.
    if ip is None or ip != ip or ip == '':
        return not_found

    try:
        data = rawdata.record_by_name(ip)
    except socket.error:
        # A single unusable value must not abort a whole DataFrame.apply run;
        # report it the same way as an address that is missing from the database.
        return not_found

    if data is None:
        return not_found

    return [data['city'], data['country_name'], data['latitude'], data['longitude']]

Loading dataframes

In [4]:
# The log records sit in the 'data' field of the entry labelled 2 in the JSON export.
lst = pd.read_json('./data/log_topic_request.json').loc[2, 'data']
topics_req_df = pd.DataFrame(lst)
topics_req_df.head(5)

Unnamed: 0,exists,format,id,time,topic,user
0,1,html,1,2018-07-20 12:53:30,computer science,angelo.salatino@open.ac.uk
1,1,html,2,2018-07-20 13:01:17,evaluation index system,5.45.207.68
2,1,html,3,2018-07-20 13:15:16,alpha phellandrene,66.249.79.98
3,1,html,4,2018-07-20 13:41:17,computer science,195.128.10.106
4,1,html,5,2018-07-20 13:44:44,microprocessor chips,195.128.10.106


In [5]:
# Number of logged topic requests (one row per request).
len(topics_req_df)

71432

In [6]:
# Same export layout as the topic log: the records are the 'data' field of entry 2.
lst = pd.read_json('./data/log_download_request.json').loc[2, 'data']
downloads_req_df = pd.DataFrame(lst)
downloads_req_df.head(5)

Unnamed: 0,format,id,time,user,version
0,nt,1,2018-10-29 18:02:57,angelo.salatino@open.ac.uk,3.0
1,owl,2,2018-10-29 18:03:00,angelo.salatino@open.ac.uk,3.0
2,csv,3,2018-10-29 18:03:01,angelo.salatino@open.ac.uk,3.0
3,csv,4,2018-10-29 18:03:26,137.108.66.104,3.0
4,owl,5,2018-10-29 18:03:29,137.108.66.104,3.0


In [7]:
# Number of logged download requests (one row per request).
len(downloads_req_df)

55

Some users were logged with an e-mail address instead of an IP address, so we have to patch them; let's make them point to the OU IP address **137.108.113.93** (these users are mainly us, from the UK, and Alex, from Germany).

In [8]:
# Rows whose user field contains 'alik' or '@' do not hold an IP address, so they
# are pointed at the OU address to make them geolocatable like every other row.
#   regex=False - both patterns are plain substrings, not regular expressions
#   na=False    - a missing user is treated as "no match"; otherwise the mask
#                 would contain NaN and .loc would refuse it
named_user = (
    topics_req_df['user'].str.contains('alik', regex=False, na=False)
    | topics_req_df['user'].str.contains('@', regex=False, na=False)
)
topics_req_df.loc[named_user, 'user'] = '137.108.113.93'
topics_req_df.head()

Unnamed: 0,exists,format,id,time,topic,user
0,1,html,1,2018-07-20 12:53:30,computer science,137.108.113.93
1,1,html,2,2018-07-20 13:01:17,evaluation index system,5.45.207.68
2,1,html,3,2018-07-20 13:15:16,alpha phellandrene,66.249.79.98
3,1,html,4,2018-07-20 13:41:17,computer science,195.128.10.106
4,1,html,5,2018-07-20 13:44:44,microprocessor chips,195.128.10.106


In [9]:
# Rows whose user field contains '@' hold an e-mail address rather than an IP address;
# point them at the OU address so that they can be geolocated.
#   regex=False - '@' is a plain substring, not a regular expression
#   na=False    - a missing user is treated as "no match"; otherwise the mask
#                 would contain NaN and .loc would refuse it
email_user = downloads_req_df['user'].str.contains('@', regex=False, na=False)
downloads_req_df.loc[email_user, 'user'] = '137.108.113.93'
downloads_req_df.head()

Unnamed: 0,format,id,time,user,version
0,nt,1,2018-10-29 18:02:57,137.108.113.93,3.0
1,owl,2,2018-10-29 18:03:00,137.108.113.93,3.0
2,csv,3,2018-10-29 18:03:01,137.108.113.93,3.0
3,csv,4,2018-10-29 18:03:26,137.108.66.104,3.0
4,owl,5,2018-10-29 18:03:29,137.108.66.104,3.0


Done!   
Now, let's create four new columns with the IP geolocation.

In [10]:
# Geolocate each row's user address; the four values returned by ipquery()
# become the new city / country / lat / lon columns, in that order.
downloads_req_df[['city', 'country', 'lat', 'lon']] = downloads_req_df.apply(
    lambda row: pd.Series(ipquery(row['user'])),
    axis=1,
)
downloads_req_df.head(5)

Unnamed: 0,format,id,time,user,version,city,country,lat,lon
0,nt,1,2018-10-29 18:02:57,137.108.113.93,3.0,Bletchley,United Kingdom,51.9833,-0.7333
1,owl,2,2018-10-29 18:03:00,137.108.113.93,3.0,Bletchley,United Kingdom,51.9833,-0.7333
2,csv,3,2018-10-29 18:03:01,137.108.113.93,3.0,Bletchley,United Kingdom,51.9833,-0.7333
3,csv,4,2018-10-29 18:03:26,137.108.66.104,3.0,,United Kingdom,51.4964,-0.1224
4,owl,5,2018-10-29 18:03:29,137.108.66.104,3.0,,United Kingdom,51.4964,-0.1224


Insert: Version fragmentation + format requested

In [11]:
# Geolocate each row's user address; the four values returned by ipquery()
# become the new city / country / lat / lon columns, in that order.
topics_req_df[['city', 'country', 'lat', 'lon']] = topics_req_df.apply(
    lambda row: pd.Series(ipquery(row['user'])),
    axis=1,
)
topics_req_df.head(5)

Unnamed: 0,exists,format,id,time,topic,user,city,country,lat,lon
0,1,html,1,2018-07-20 12:53:30,computer science,137.108.113.93,Bletchley,United Kingdom,51.9833,-0.7333
1,1,html,2,2018-07-20 13:01:17,evaluation index system,5.45.207.68,,Russian Federation,55.7386,37.6068
2,1,html,3,2018-07-20 13:15:16,alpha phellandrene,66.249.79.98,Mountain View,United States,37.4192,-122.0574
3,1,html,4,2018-07-20 13:41:17,computer science,195.128.10.106,,Netherlands,52.3824,4.8995
4,1,html,5,2018-07-20 13:44:44,microprocessor chips,195.128.10.106,,Netherlands,52.3824,4.8995


In [19]:
# Number of topic requests per country.
data = topics_req_df.groupby(['country'])['id'].count().reset_index()

# The map is coloured by log10(count); presumably so that a few very active
# countries do not wash out all the others.
log_counts = np.log10(data['id'])

# Put the colour-bar ticks on whole powers of ten and label them with the real
# counts (1, 10, 100, ...), so the bar titled '#requests' does not show exponents.
# This replaces `autotick`, which is not part of the current colorbar schema.
top = log_counts.max()
tick_exponents = list(range(int(np.floor(top)) + 1)) if np.isfinite(top) else []

tmp = [ dict(
        type = 'choropleth',
        locationmode = 'country names',
        locations = data['country'],
        z = log_counts,
        text = data['id'],  # hover text keeps the untransformed count
        autocolorscale = True,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            title = '#requests',
            tickmode = 'array',
            tickvals = tick_exponents,
            ticktext = ['{:,}'.format(10 ** e) for e in tick_exponents]),
      ) ]

layout = dict(
    title = 'Request distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'equirectangular'
        )
    )
)

fig = dict(data=tmp, layout=layout)
plotly.offline.iplot( fig, validate=False )

In [18]:
# Number of distinct users (distinct values of the user column) per country.
data = topics_req_df.groupby(['country'])['user'].nunique().reset_index()

# The map is coloured by log10(count); presumably so that a few very active
# countries do not wash out all the others.
log_counts = np.log10(data['user'])

# Put the colour-bar ticks on whole powers of ten and label them with the real
# counts (1, 10, 100, ...), so the bar titled '#users' does not show exponents.
# This replaces `autotick`, which is not part of the current colorbar schema.
top = log_counts.max()
tick_exponents = list(range(int(np.floor(top)) + 1)) if np.isfinite(top) else []

tmp = [ dict(
        type = 'choropleth',
        locationmode = 'country names',
        locations = data['country'],
        z = log_counts,
        text = data['user'],  # hover text keeps the untransformed count
        autocolorscale = True,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            title = '#users',
            tickmode = 'array',
            tickvals = tick_exponents,
            ticktext = ['{:,}'.format(10 ** e) for e in tick_exponents]),
      ) ]

layout = dict(
    title = 'Request distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'equirectangular'
        )
    )
)

fig = dict(data=tmp, layout=layout)
plotly.offline.iplot( fig, validate=False )

In [20]:
# Number of download requests per country.
data = downloads_req_df.groupby(['country'])['id'].count().reset_index()

# The map is coloured by log10(count); presumably so that a few very active
# countries do not wash out all the others.
log_counts = np.log10(data['id'])

# Put the colour-bar ticks on whole powers of ten and label them with the real
# counts (1, 10, 100, ...), so the bar titled '#downloads' does not show exponents.
# This replaces `autotick`, which is not part of the current colorbar schema.
top = log_counts.max()
tick_exponents = list(range(int(np.floor(top)) + 1)) if np.isfinite(top) else []

tmp = [ dict(
        type = 'choropleth',
        locationmode = 'country names',
        locations = data['country'],
        z = log_counts,
        text = data['id'],  # hover text keeps the untransformed count
        autocolorscale = True,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            title = '#downloads',
            tickmode = 'array',
            tickvals = tick_exponents,
            ticktext = ['{:,}'.format(10 ** e) for e in tick_exponents]),
      ) ]

layout = dict(
    title = 'Download distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'equirectangular'
        )
    )
)

fig = dict(data=tmp, layout=layout)
plotly.offline.iplot( fig, validate=False )

In [21]:
# Number of distinct downloading users (distinct values of the user column) per country.
data = downloads_req_df.groupby(['country'])['user'].nunique().reset_index()

# The map is coloured by log10(count); presumably so that a few very active
# countries do not wash out all the others.
log_counts = np.log10(data['user'])

# Put the colour-bar ticks on whole powers of ten and label them with the real
# counts (1, 10, 100, ...), so the bar titled '#users(downloads)' does not show
# exponents. This replaces `autotick`, which is not part of the current colorbar schema.
top = log_counts.max()
tick_exponents = list(range(int(np.floor(top)) + 1)) if np.isfinite(top) else []

tmp = [ dict(
        type = 'choropleth',
        locationmode = 'country names',
        locations = data['country'],
        z = log_counts,
        text = data['user'],  # hover text keeps the untransformed count
        autocolorscale = True,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            title = '#users(downloads)',
            tickmode = 'array',
            tickvals = tick_exponents,
            ticktext = ['{:,}'.format(10 ** e) for e in tick_exponents]),
      ) ]

layout = dict(
    title = 'Download distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'equirectangular'
        )
    )
)

fig = dict(data=tmp, layout=layout)
plotly.offline.iplot( fig, validate=False )

In [43]:
# Requests whose 'exists' flag equals the string '0'.
topics_req_df.loc[topics_req_df['exists'] == '0']

Unnamed: 0,exists,format,id,time,topic,user,city,country,lat,lon
51,0,html,52,2018-07-20 20:00:31,plastic optical fiber,141.8.132.38,,Russian Federation,55.7386,37.6068
61,0,html,62,2018-07-20 21:33:37,computer architecture,141.8.132.38,,Russian Federation,55.7386,37.6068
74,0,html,75,2018-07-21 03:10:47,plastic optical fiber,188.32.9.134,Moscow,Russian Federation,55.7522,37.6156
81,0,html,82,2018-07-21 05:30:33,,37.215.137.111,Borki,Belarus,54.7165,29.4848
82,0,html,83,2018-07-21 05:30:34,computer,37.215.137.111,Borki,Belarus,54.7165,29.4848
83,0,html,84,2018-07-21 05:31:03,,37.215.137.111,Borki,Belarus,54.7165,29.4848
85,0,html,86,2018-07-21 05:31:13,,37.215.137.111,Borki,Belarus,54.7165,29.4848
96,0,html,97,2018-07-21 08:02:57,rapid prototyping systems,141.8.132.38,,Russian Federation,55.7386,37.6068
98,0,html,99,2018-07-21 08:57:23,classify/,141.8.132.38,,Russian Federation,55.7386,37.6068
99,0,html,100,2018-07-21 08:57:24,classify/,37.9.113.60,Moscow,Russian Federation,55.7522,37.6156
