# Evolution of patents in the world

In [33]:
import pandas as pd
from iso3166 import countries
import pycountry
from branca.colormap import linear

import folium
import json

import sys
path_data = "./../data/"
sys.path.append(path_data)

Patent authority where the application was filled

In [2]:
data = pd.read_csv("./../data/patents_per_country.csv", sep=";",
                  names=["country_short", "year", "n_patents"])[1:]

In [3]:
data.head()

Unnamed: 0,country_short,year,n_patents
1,DK,1967,3305.0
2,US,1939,44222.0
3,BG,1989,1360.0
4,VN,1999,1.0
5,SU,1911,6.0


In [4]:
data.year.min()

'1782'

In [5]:
country_codes = pd.read_csv("../data/country_codes.csv", sep=";")

In [6]:
country_codes.head()

Unnamed: 0,country,country_short
0,AFGHANISTAN,AF
1,ALBANIA,AL
2,ALGERIA,DZ
3,ANDORRA,AD
4,ANGOLA,AO


In [7]:
data = data.merge(country_codes, on='country_short')
data.year = pd.to_numeric(data.year)
data.n_patents = pd.to_numeric(data.n_patents)

In [8]:
data.head()

Unnamed: 0,country_short,year,n_patents,country
0,DK,1967,3305.0,DENMARK
1,DK,1961,2981.0,DENMARK
2,DK,1964,3243.0,DENMARK
3,DK,1970,2914.0,DENMARK
4,DK,1993,8453.0,DENMARK


In [9]:
data.n_patents.sum()

70414723.0

In [10]:
# Total number of patents per country
sums = data.groupby(["country", "year"]).sum()
sums.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_patents
country,year,Unnamed: 2_level_1
AFGHANISTAN,1976,1.0
AFGHANISTAN,9999,2.0
ALBANIA,1995,1.0
ALBANIA,1996,1.0
ALBANIA,2010,1.0


## Cleaning 
There are too much countrie with year `9999`.  

In [11]:
sums[sums.index.get_level_values('year') == 9999].head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_patents
country,year,Unnamed: 2_level_1
AFGHANISTAN,9999,2.0
ALBANIA,9999,9.0
ALGERIA,9999,3.0
ANDORRA,9999,7.0
ANGOLA,9999,1.0
ARGENTINA,9999,145.0
ARMENIA,9999,34.0
AUSTRALIA,9999,51280.0
AUSTRIA,9999,5319.0
AZERBAIJAN,9999,23.0


we decided to ignored them first, but to take only top 50 countries (overall patents number) because some countries are unsignificant compared to the top.

## TOP 50

In [12]:
rank = pd.DataFrame(data.groupby(["country_short",'country']).n_patents.sum()).sort_values(by="n_patents", axis=0, ascending=False)

In [13]:
top50 = rank[0:50]
top50.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_patents
country_short,country,Unnamed: 2_level_1
JP,JAPAN,15316682.0
US,UNITED STATES,14948130.0
DE,GERMANY,5972070.0
CN,CHINA,5968600.0
GB,UNITED KINGDOM,3515530.0


How many of them don't exist anymore or have changed the name.

In [14]:
top50 = top50.reset_index()

list_na = []
for  c in top50.country:
    try:
        countries.get(c)
    except:
        list_na.append(c)
                

In [15]:
list_na

['UNION OF SOVIET SOCIALIST REPUBLICS',
 'GERMAN DEMOCRATIC REPUBLIC',
 'CZECHOSLOVAKIA',
 'YUGOSLAVIA']

## Mapping historic countries to existing ones


In [16]:
dic_country_na = dict([('RUSSIAN FEDERATION', 'UNION OF SOVIET SOCIALIST REPUBLICS'), 
                    ('GERMANY', 'GERMAN DEMOCRATIC REPUBLIC'), 
                    ('CZECH REPUBLIC', 'CZECHOSLOVAKIA'), 
                    ('SLOVAKIA','CZECHOSLOVAKIA'), 
                    ('CROATIA','YUGOSLAVIA'), 
                    ('SLOVENIA','YUGOSLAVIA'), 
                    ('MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF','YUGOSLAVIA'),
                    ('BOSNIA AND HERZEGOVINA','YUGOSLAVIA')])

In [17]:
# Create new Dataframe with the data from country who doesn't exist anymore. 
# Assign it to countries who replaced them. 


data_temp = sums.reset_index()

country_hist = []
year_hist = []
n_patents_hist = []

for c in dic_country_na:
    old = dic_country_na.get(c)
    yrs = data_temp[data_temp.country == old].year
    for y in yrs:
        val_ = sums.loc[old,y].n_patents
        country_hist.append(c)
        year_hist.append(y)
        n_patents_hist.append(val_)
data_hist = pd.DataFrame({'country':country_hist, 'year': year_hist, 'n_patents':n_patents_hist})

# Delete the old country
for pays in list_na:
    sums = sums.drop(pays)

In [18]:
# Concat the two dataframe (historic countries with new names and current countries)
data_curr = sums.reset_index()
data_ud = pd.concat([data_curr, data_hist], axis=0, ignore_index=True)

In [19]:
data_ud = data_ud.merge(country_codes, on='country')

In [20]:
data_ud.head()

Unnamed: 0,country,n_patents,year,country_short
0,AFGHANISTAN,1.0,1976,AF
1,AFGHANISTAN,2.0,9999,AF
2,ALBANIA,1.0,1995,AL
3,ALBANIA,1.0,1996,AL
4,ALBANIA,1.0,2010,AL


In [21]:
is_top50 = data_ud.country_short.isin(top50.country_short)

In [22]:
data_ud = data_ud.ix[is_top50]
data_ud.head()

Unnamed: 0,country,n_patents,year,country_short
54,ARGENTINA,1.0,1904,AR
55,ARGENTINA,1.0,1905,AR
56,ARGENTINA,1.0,1919,AR
57,ARGENTINA,1.0,1920,AR
58,ARGENTINA,2.0,1921,AR


In [23]:
patents = data_ud.groupby(["country_short", "year","country"]).sum()

In [24]:
patents.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_patents
country_short,year,country,Unnamed: 3_level_1
AR,1904,ARGENTINA,1.0
AR,1905,ARGENTINA,1.0
AR,1919,ARGENTINA,1.0
AR,1920,ARGENTINA,1.0
AR,1921,ARGENTINA,2.0


# Function

Extract only the specific year from `patents` dataframe.

In [25]:
def get_year(yr):
    df_byYear = patents[patents.index.get_level_values('year') == yr]
    df_byYear = df_byYear.reset_index()
    del df_byYear['year']
    alpha3 = [countries.get(c)[2] for c in df_byYear.country_short]
    df_byYear['code'] = alpha3
    df_byYear = df_byYear.rename(columns={'n_patents': 'n_patents_'+str(yr)})
    return df_byYear

In [26]:
pat_2010 = get_year(2010)
pat_2010.head()

Unnamed: 0,country_short,country,n_patents_2010,code
0,AR,ARGENTINA,4603.0,ARG
1,AT,AUSTRIA,4254.0,AUT
2,AU,AUSTRALIA,30748.0,AUS
3,BE,BELGIUM,727.0,BEL
4,BG,BULGARIA,185.0,BGR


In [34]:
def fill_empty_country(df, year):
    """
    In case some countries don't appear in the DF, add them with 0 patents.
    """
    
    # get the current countries
    df = df.reset_index()
    del df['country_short']
    del df['country']
    current_countries = list(set(df["code"]))

    # see which cantons miss
    
    def difference(a, b):
        """
        For two data structures A and B, this function outputs the elements of A that are not in B
        """
        return list(set(a) - set(b))

    empty_country = difference(JSON_COUNTRIES_ALL, current_countries)
    
    # build a dataframe with these missing countries with 0 patents.
    
    empty_country_df = pd.DataFrame(data=list(zip(empty_country, [0]*len(empty_country))),
                                columns=["code","n_patents_"+str(year)])
    
    # append this to current dataframe and reindex
    
    df = pd.concat([df, empty_country_df])
    df['n_patents_'+str(year)] = pd.to_numeric(df['n_patents_'+str(year)], errors='coerce')
    df = df[['code', 'n_patents_'+str(year)]]
    
    return df

In [35]:
# convert the dataframe to dict
def make_dict(year):
    df = get_year(year)
    df_all = fill_empty_country(df, year)
    
    dict_ = df_all.set_index('code')['n_patents_'+str(year)]
    
    return dict_


In [36]:
countries_geo = json.load(open(path_data+'countries.geo.json'))

In [37]:
JSON_COUNTRIES_ALL = [i['id'] for i in countries_geo['features']]

In [38]:
make_dict(2010).head()

code
ARG     4603.0
AUT     4254.0
AUS    30748.0
BEL      727.0
BGR      185.0
Name: n_patents_2010, dtype: float64

# Visualization

## Plot 

In [39]:
%matplotlib inline
import matplotlib.pyplot as plt

from ipywidgets import interact, Play, widgets

from bokeh.layouts import row, column, layout, gridplot
from bokeh.models import ColumnDataSource, DataRange1d, Select, Legend, HoverTool, BoxSelectTool,WheelZoomTool,SaveTool, WidgetBox, LinearColorMapper
from bokeh.palettes import Blues4
from bokeh.plotting import figure
from bokeh.charts import Histogram, Bar
from bokeh.io import push_notebook, show, output_notebook, curdoc, hplot

from math import pi

import seaborn as sns
output_notebook()

In [40]:
def plot_per_country(ctry, start_year):
    df = pd.DataFrame(sums.loc[(ctry, )])
    df = df[df.index <= 2016]
    df = df[df.index >= start_year]
    plt.plot(df.index, df.n_patents);
    print("# of patents from that date :", df.n_patents.sum())

In [None]:
interact(plot_per_country, ctry=list(set([c for c,y in sums.index.values[::-1]])), start_year=(1950,2016,1))

## World MAP

### MAP ver.1

In [41]:
def viz_map(year):

    data = get_year(year)
    data_dict = make_dict(year)    
    
    colormap = linear.YlOrRd.scale(
    data['n_patents_'+str(year)].min(),
    data['n_patents_'+str(year)].max())
    
    color_dict = {key: colormap(data_dict[key]) for key in data_dict.keys()}

    
    # MAP 
    m = folium.Map(location=[46.7303575,8.2950065], zoom_start=2)
    folium.GeoJson(
        countries_geo,
        style_function=lambda feature: {
            
            #'fillColor': color_dict[feature['id']],
            'fillColor': 'white' if data_dict[feature['id']] == 0.0 else color_dict[feature['id']],
            'color': 'black',
            'weight': 1,
            'fillOpacity': 0.9
        }
    ).add_to(m)
        
    


    return m

### MAP ver.2

In [42]:
import folium
import folium.plugins

from folium.features import *
class DivIcon(MacroElement):
    def __init__(self, html='', size=(30,30), anchor=(0,0), style=''):
        """TODO : docstring here"""
        super(DivIcon, self).__init__()
        self._name = 'DivIcon'
        self.size = size
        self.anchor = anchor
        self.html = html
        self.style = style

        self._template = Template(u"""
            {% macro header(this, kwargs) %}
              <style>
                .{{this.get_name()}} {
                    {{this.style}}
                    }
              </style>
            {% endmacro %}
            {% macro script(this, kwargs) %}
                var {{this.get_name()}} = L.divIcon({
                    className: '{{this.get_name()}}',
                    iconSize: [{{ this.size[0] }},{{ this.size[1] }}],
                    iconAnchor: [{{ this.anchor[0] }},{{ this.anchor[1] }}],
                    html : "{{this.html}}",
                    });
                {{this._parent.get_name()}}.setIcon({{this.get_name()}});
            {% endmacro %}
            """)

In [143]:
def viz_map2(year, scale, suisse):
    import math
    import numpy as np
    data = get_year(year)
    data_dict = make_dict(year)
    data_all = fill_empty_country(data, year)
    
    colormap = linear.YlOrRd.scale(
    data_all['n_patents_'+str(year)].min(),
    data_all['n_patents_'+str(year)].max())
    
    color_dict = {key: colormap(data_dict[key]) for key in data_dict.keys()}
    
    #NB: For reveiwers, if you run, modify folium.py by setting the threshold limit to >10
    
    scale_log = list(np.logspace(0, math.log(data_all['n_patents_'+str(year)].max(), 9), 9, endpoint=True))
    scale_lin = list(np.linspace(0, data_all['n_patents_'+str(year)].max(), 6, endpoint=True))
    # MAP 
    m = folium.Map(location=[46.7303575,8.2950065], zoom_start=5 if suisse == True else 2)
    
    m.choropleth(
    geo_str=open(path_data+'countries.geo.json').read(),
    data=data_all,
    columns=['code', 'n_patents_'+str(year)],
    key_on='feature.id',
    fill_color='YlOrRd',
    threshold_scale= scale_lin if scale == 'lin' else scale_log,
    fill_opacity=0.7,
    line_opacity=0.2,
    )
    

    folium.map.Marker(
    [47, -10] if suisse == True else [0,-150],
    icon=DivIcon(
        size=(150,36),
        anchor=(0,0),
        html=str(year),
        style="""
            font-size:72px;
            background-color: transparent;
            border-color: transparent;
            text-align: right;
            """
        )
    ).add_to(m)
    
    

    return m

In [174]:
viz_map2(2010,'log', False)

# Screenshot of maps

In [172]:
missingyears = [2003]
len(missingyears)

2

In [173]:
import os
import time
from selenium import webdriver

for i in missingyears:
    m = viz_map2(i,'log', True)
    
    delay=5

    #Save the map as an HTML file
    fn='testmap'+str(i)+'_CH.html'
    tmpurl='file://{path}/{mapfile}'.format(path=os.getcwd(),mapfile=fn)
    m.save(fn)

    #Open a browser window...
    browser = webdriver.Chrome()
    #..that displays the map...
    browser.get(tmpurl)
    #Give the map tiles some time to load
    time.sleep(delay)
    #Grab the screenshot
    browser.save_screenshot('map'+str(i)+'_CH.png')
    #Close the browser
    browser.quit()