In [1]:
import pandas as pd
import numpy as np
import openpyxl

# Set up custom functions for use

In [2]:
def examine_data(df, nrow=5):
    print("Number of rows: {}\n".format(df.shape[0]))
    print("Columns: {}\n\n".format(list(df.columns)))
    return df.head(nrow)

import re
def print_columns_of_all_datasets(data, search=r'.*'): 
    for key in data.keys():
        cols = data[key].columns[data[key].columns.str.contains(search,flags=re.I)]
        print("***Columns of '{}':*** \n {}\n".format(key,list(cols)))
    return None

def clean_df_cols(df, cols_to_use_map):
    cols_to_rename = [x for x in list(df.columns) 
                      if (x not in list(cols_to_use_map.values())
                          and x in list(cols_to_use_map.keys()))]
    m = dict(zip(cols_to_rename, [cols_to_use_map[col] for col in cols_to_rename]))
    df = df.rename(columns=m).loc[:,list(cols_to_use_map.values())]
    return df

# Load in Excel sheets data

In [3]:
xl = pd.ExcelFile('Convatec data.xlsx')
data = xl.parse(xl.sheet_names) # parse in each sheet to data dictioanry
keys = list(data.keys())
keys

['China Patent',
 'PCT',
 'TW patent',
 'IPC Class',
 'Applicant Names',
 'China Patent (Soopat)']

### Choose the datasets to work on

- `China Patent`, `PCT`, `IPC`

In [4]:
keys = ['China Patent', 'PCT', 'IPC Class']
data = {k:v for k,v in data.items() if k in keys}
data['IPC'] = data.pop('IPC Class')
data.keys()

dict_keys(['China Patent', 'PCT', 'IPC'])

# Deine and clean the columns to use for all datasets

In [5]:
print_columns_of_all_datasets(data)

***Columns of 'China Patent':*** 
 ['申请号', '申请号(search)', '主申请类', '类型', 'Agent', 'Unnamed: 5', '申请人', '申请人(soopat)', '地区', '申请日', '公开(公告)号', '公开(公告)日', '名称', '发明名称', 'Unnamed: 14']

***Columns of 'PCT':*** 
 ['Application No.', 'Agent', 'Firm', 'City', 'Country', 'Title', 'PubDate', 'US Application No.', 'Applicant', 'Inventor', 'Int.Class']

***Columns of 'IPC':*** 
 ['主申请类(soopat)', '主申请类链接(soopat)', '分类号(wipo)', '分类号链接(wipo)', '标准分类号', '分类表年份', '部', '部（说明）', '大类', '大类（解释）', '小类', '小类（说明）', '大组', '大组（说明）', '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', '六点小组', '六点小组（说明）']



In [6]:
china_patent_cols_dict = {'申请号':'App_No(Soopat)', '申请号(search)':'App_No', 
                          '主申请类':'IPC(Soopat)', '类型':'Type', 'Agent':'Firm', 
                          '申请人':'Applicant_EN', '申请人(soopat)':'Applicant_CN','地区': 'China_Judct', 
                          '申请日':'AppDate', '公开(公告)号':'Pub_No', 
                          '公开(公告)日':'PubDate', '名称': 'Title_CN',
                          '发明名称': 'Title_EN'}


cols = ['Application No.', 'Agent', 'Firm', 'City', 'Country', 'Title', 'PubDate', 
 'US Application No.', 'Applicant', 'Inventor', 'Int.Class']
newcols = ['App_No(WIPO)', 'Agent', 'Firm', 'City', 'Country', 'Title_EN', 'PubDate', 
             'US_App_No', 'Applicant_EN', 'Inventor', 'IPC(WIPO)']
pct_cols_dict = dict(zip(cols, newcols))


cols = ['主申请类(soopat)', '主申请类链接(soopat)', '分类号(wipo)', 
 '分类号链接(wipo)', '标准分类号', '分类表年份', '部', '部（说明）', 
 '大类', '大类（解释）', '小类', '小类（说明）', '大组', '大组（说明）', 
 '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', 
 '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', 
 '六点小组', '六点小组（说明）']
newcols = ['IPC(Soopat)', 'IPC_Link(Soopat)', 'IPC(WIPO)', 
 'IPC_link(WIPO)', 'IPC', 'IPC_Version', '部', '部（说明）', 
 '大类', '大类（说明）', '小类', '小类（说明）', '大组', '大组（说明）', 
 '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', 
 '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', 
 '六点小组', '六点小组（说明）']
IPC_cols_dict = dict(zip(cols, newcols))

In [7]:
data['China Patent'] = clean_df_cols(data['China Patent'],china_patent_cols_dict)
data['PCT'] = clean_df_cols(data['PCT'],pct_cols_dict)
data['IPC'] = clean_df_cols(data['IPC'],IPC_cols_dict)

# Examine the dataset

In [8]:
for key in data.keys():
    print("'{}'\n".format(key))
    print(examine_data(data[key],2),"\n")

'China Patent'

Number of rows: 69

Columns: ['App_No(Soopat)', 'App_No', 'IPC(Soopat)', 'Type', 'Firm', 'Applicant_EN', 'Applicant_CN', 'China_Judct', 'AppDate', 'Pub_No', 'PubDate', 'Title_CN', 'Title_EN']


    App_No(Soopat)        App_No IPC(Soopat) Type                    Firm  \
0  CN:201730554158  201730554158   24-02(11)   外观  北京康信知识产权代理有限责任公司 11240   
1  CN:201730554195  201730554195   24-02(11)   外观  北京康信知识产权代理有限责任公司 11240   

          Applicant_EN Applicant_CN China_Judct     AppDate        Pub_No  \
0  CONVATEC CO., LTD.;      康维德有限公司          CN  2017.11.10  CN304631416S   
1  CONVATEC CO., LTD.;      康维德有限公司          CN  2017.11.10  CN304631417S   

      PubDate     Title_CN                                           Title_EN  
0  2018.05.15  负压伤口治疗仪用连接器  Connector for negative pressure wound treatmen...  
1  2018.05.15    负压伤口治疗仪用泵  Pump for negative pressure wound treatment ins...   

'PCT'

Number of rows: 80

Columns: ['App_No(WIPO)', 'Agent', 'Firm', 'City', 'Count

# Convert dates

In [9]:
print_columns_of_all_datasets(data, r'date')

***Columns of 'China Patent':*** 
 ['AppDate', 'PubDate']

***Columns of 'PCT':*** 
 ['PubDate']

***Columns of 'IPC':*** 
 []



### Convert dates in `China Patent`

In [10]:
data['China Patent'][['AppDate', 'PubDate']].head()

Unnamed: 0,AppDate,PubDate
0,2017.11.10,2018.05.15
1,2017.11.10,2018.05.15
2,2017.12.22,2018.06.26
3,2008.10.20,2010.12.15
4,2017.11.10,2018.03.20


In [11]:
f = '%Y.%m.%d'
data['China Patent'][['AppDate', 'PubDate']] = (data['China Patent'][['AppDate', 'PubDate']]
                                                .apply(pd.to_datetime,format=f, 
                                                       # return NaT for non-parseable date
                                                       errors='coerce'))
print(data['China Patent'][['AppDate', 'PubDate']].isna().sum())
data['China Patent'][['AppDate', 'PubDate']].head()                                      

AppDate     0
PubDate    15
dtype: int64


Unnamed: 0,AppDate,PubDate
0,2017-11-10,2018-05-15
1,2017-11-10,2018-05-15
2,2017-12-22,2018-06-26
3,2008-10-20,2010-12-15
4,2017-11-10,2018-03-20


In [12]:
data['China Patent'].loc[data['China Patent']['PubDate'].isna(), 'Pub_No'].shape[0]

15

All data with `PubDate` as `NaT` are found with `Pub_No` as `NaN`, so it's not an issue to worry about.

### Convert dates in `PCT`

In [13]:
data['PCT'].loc[:,'PubDate'].head()

0    08.10.2009
1    29.10.2009
2    08.07.2010
3    23.12.2009
4    23.12.2009
Name: PubDate, dtype: object

In [14]:
f = '%d.%m.%Y'
data['PCT'].loc[:,'PubDate']=pd.to_datetime(data['PCT'].loc[:,'PubDate'], format=f, errors='coerce')
print(data['PCT'].loc[:,'PubDate'].isna().sum())
data['PCT'].loc[:,'PubDate'].head()

0


0   2009-10-08
1   2009-10-29
2   2010-07-08
3   2009-12-23
4   2009-12-23
Name: PubDate, dtype: datetime64[ns]

### Add `AppDate` in `PCT`
- Extract `year` from `App_No(WIPO)` 

In [15]:
data['PCT']['AppDate'] = data['PCT']['App_No(WIPO)'].str.extract(r'\/(.*)\/').astype('datetime64')
data['PCT'].head(2)

Unnamed: 0,App_No(WIPO),Agent,Firm,City,Country,Title_EN,PubDate,US_App_No,Applicant_EN,Inventor,IPC(WIPO),AppDate
0,WO/2009/124324,"KRIEGER, Stuart",Bristol-Myers Squibb Company,"New York, NY",US,DRAINABLE OSTOMY POUCH,2009-10-08,PCT/US2009/039764,"CONVATEC TECHNOLOGIES INC,","MURRAY, Kimberly",A61F 5/44,2009-01-01
1,WO/2009/131992,"KRIEGER, Stuart",Bristol-Myers Squibb Company,"New York, NY",US,TEMPORARY OSTOMY APPLIANCE,2009-10-29,PCT/US2009/041237,CONVATEC TECHNOLOGIES INC.,"GREGORY, Christopher",A61F 5/445,2009-01-01


# Remove duplicate patents

In [16]:
subset = ['App_No', 'Pub_No','Type']
data['China Patent'] = data['China Patent'].sort_values(subset).drop_duplicates(subset=subset)
data['PCT'] = data['PCT'].sort_values('App_No(WIPO)').drop_duplicates()

# Merge `China Patent` with `IPC`

In [17]:
data["China_Patent_with_IPC"] = data['China Patent'].merge(data['IPC'], on='IPC(Soopat)').head(2)
data["China_Patent_with_IPC"].head()

Unnamed: 0,App_No(Soopat),App_No,IPC(Soopat),Type,Firm,Applicant_EN,Applicant_CN,China_Judct,AppDate,Pub_No,...,二点小组,二点小组（说明）,三点小组,三点小组（说明）,四点小组,四点小组（说明）,五点小组,五点小组（说明）,六点小组,六点小组（说明）
0,200680027416.6,200680027416,A61F5/01(2006.01)I,授权,中国专利代理(香港)有限公司 72001,,康沃特克科技公司,CN,2006-06-08,,...,,,,,,,,,,
1,200680027793.X,200680027793,A61F5/01(2006.01)I,授权,中国专利代理(香港)有限公司 72001,,康沃特克科技公司,CN,2006-06-08,,...,,,,,,,,,,


In [18]:
cols = ['App_No(Soopat)', 'App_No', 'Type', 'Firm',
       'Applicant_CN', 'AppDate', 'Pub_No',
       'PubDate', 'Title_CN', 'IPC', 'IPC_Link(Soopat)', 
       'IPC_link(WIPO)', 'IPC_Version', '部', '大类',
       '小类', '大组']
data["China_Patent_with_IPC"] = data["China_Patent_with_IPC"][cols]
data["China_Patent_with_IPC"].head(2)

Unnamed: 0,App_No(Soopat),App_No,Type,Firm,Applicant_CN,AppDate,Pub_No,PubDate,Title_CN,IPC,IPC_Link(Soopat),IPC_link(WIPO),IPC_Version,部,大类,小类,大组
0,200680027416.6,200680027416,授权,中国专利代理(香港)有限公司 72001,康沃特克科技公司,2006-06-08,,NaT,足部的压缩装置,A61F5/01,http://www.soopat.com/IPC/Code/A61F5-01,,2006.01,A,A61,A61F,5/00
1,200680027793.X,200680027793,授权,中国专利代理(香港)有限公司 72001,康沃特克科技公司,2006-06-08,,NaT,将压缩提供到肢体的箍套,A61F5/01,http://www.soopat.com/IPC/Code/A61F5-01,,2006.01,A,A61,A61F,5/00


# Merge `PCT` with `IPC`

In [19]:
data['PCT_with_IPC'] = data['PCT'].merge(data['IPC'], on='IPC(WIPO)')
data['PCT_with_IPC'].columns

Index(['App_No(WIPO)', 'Agent', 'Firm', 'City', 'Country', 'Title_EN',
       'PubDate', 'US_App_No', 'Applicant_EN', 'Inventor', 'IPC(WIPO)',
       'AppDate', 'IPC(Soopat)', 'IPC_Link(Soopat)', 'IPC_link(WIPO)', 'IPC',
       'IPC_Version', '部', '部（说明）', '大类', '大类（说明）', '小类', '小类（说明）', '大组',
       '大组（说明）', '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', '三点小组（说明）',
       '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', '六点小组', '六点小组（说明）'],
      dtype='object')

In [20]:
cols = ['App_No(WIPO)', 'AppDate','Agent', 'Firm', 'City', 'Country', 'Title_EN',
       'PubDate', 'US_App_No', 'Applicant_EN', 'IPC', 'IPC(WIPO)', 'IPC_Link(Soopat)', 'IPC_link(WIPO)', 
       'IPC_Version', '部', '大类', '小类', '大组']
data['PCT_with_IPC'] = data['PCT_with_IPC'][cols]
data['PCT_with_IPC'].head(2)

Unnamed: 0,App_No(WIPO),AppDate,Agent,Firm,City,Country,Title_EN,PubDate,US_App_No,Applicant_EN,IPC,IPC(WIPO),IPC_Link(Soopat),IPC_link(WIPO),IPC_Version,部,大类,小类,大组
0,WO/2008/062209,2008-01-01,"MAYS, Julie",Barker Brettell LLP,London,UK,DISSOLUTION AND PROCESSING OF CELLULOSE,2008-05-29,PCT/GB2007/004488,CONVATEC TECHNOLOGIES INC.,A61K47/18,A61K 47/18,,http://www.wipo.int/ipcpub/?symbol=A61K0047180...,2006.01,A,A61,A61K,47/00
1,WO/2008/103788,2008-01-01,"KRIEGER, Stuart",Bristol-Myers Squibb Company,"New York, NY",US,SEAL FOR A RECTAL OR OSTOMY APPLIANCCE,2008-08-28,PCT/US2008/054517,CONVATEC TECHNOLOGIES INC.,A61F5/445,A61F 5/445,http://www.soopat.com/IPC/Code/A61F5-445,,2006.01,A,A61,A61F,5/00


# Export datasets

In [21]:
data.keys()

dict_keys(['China Patent', 'PCT', 'IPC', 'China_Patent_with_IPC', 'PCT_with_IPC'])

In [24]:
import os
outdir = './cleaned'
if not os.path.exists(outdir):
    os.mkdir(outdir)

for key in data.keys():
    fname = outdir + '/' + key + '.csv'
    data[key].to_csv(fname, encoding='utf-8’')