In [100]:
import pandas as pd
import numpy as np
import openpyxl

# Define Custom Functions

In [101]:
# Show the column names of every dataset, optionally filtered by a regex.
def print_columns_of_all_datasets(data, search=r'.*'):
    """Print, per dataset, the column names that match ``search``.

    Parameters
    ----------
    data : mapping of dataset name -> DataFrame
    search : regular expression tested case-insensitively against each
        column name; the default pattern matches every column.

    Returns
    -------
    None. The matching names are only printed, one block per dataset.
    """
    import re
    for name, frame in data.items():
        all_columns = frame.columns
        is_match = all_columns.str.contains(search, flags=re.IGNORECASE)
        matched = list(all_columns[is_match])
        print("Columns of '{}':\n {}\n".format(name, matched))
    return None

# Rename columns through a mapping and keep only the mapped columns.
def clean_df_cols(df, df_cols_dict_map):
    """Return ``df`` with columns renamed and restricted to the mapped names.

    Parameters
    ----------
    df : DataFrame to clean; it is not modified.
    df_cols_dict_map : dict of old column name -> new column name.

    Returns
    -------
    A DataFrame whose columns are exactly the values of ``df_cols_dict_map``,
    in the order they appear in the mapping. Selecting a new name that is
    absent after renaming raises ``KeyError``.
    """
    target_names = list(df_cols_dict_map.values())
    # A column is renamed only when it is an old name (a key of the map) and
    # is not itself already one of the new names (a value of the map).
    rename_map = {
        col: df_cols_dict_map[col]
        for col in df.columns
        if col in df_cols_dict_map and col not in target_names
    }
    renamed = df.rename(columns=rename_map)
    return renamed.loc[:, target_names]

# Load Data

The data are stored in several sheets of one Excel workbook. Load the workbook and parse every sheet into a dictionary that maps each sheet name to its DataFrame.

In [102]:
# Open the workbook in a context manager so its file handle is released
# as soon as the sheets have been parsed.
with pd.ExcelFile('Convatec data.xlsx') as xl:
    # Passing the full list of sheet names returns a dictionary that maps
    # each sheet name to its DataFrame.
    data = xl.parse(xl.sheet_names)

# sheet names
keys = list(data.keys())
keys

['China Patent',
 'PCT',
 'TW patent',
 'IPC Class',
 'Applicant Names',
 'China Patent (Soopat)']

##### Choose the following datasets to work on

`China Patent`, `PCT`, `IPC`

In [103]:
# Sheets to keep, and the shorter key the IPC sheet is stored under.
keys = ['China Patent', 'PCT', 'IPC Class']
key_renames = {'IPC Class': 'IPC'}
# Accept both the original sheet names and the renamed keys, and rename while
# filtering. This keeps the cell safe to re-run: once 'IPC Class' has become
# 'IPC', running it again keeps 'IPC' instead of raising KeyError.
kept_names = set(keys) | set(key_renames.values())
data = {key_renames.get(k, k): v for k, v in data.items() if k in kept_names}
data.keys()

dict_keys(['China Patent', 'PCT', 'IPC'])

# Rename Columns

In [104]:
# List every column of each selected dataset (the default pattern matches all names)
print_columns_of_all_datasets(data)

Columns of 'China Patent':
 ['申请号', '申请号(search)', '主申请类', '类型', 'Agent', 'Unnamed: 5', '申请人', '申请人(soopat)', '地区', '申请日', '公开(公告)号', '公开(公告)日', '名称', '发明名称', 'Unnamed: 14']

Columns of 'PCT':
 ['Application No.', 'Agent', 'Firm', 'City', 'Country', 'Title', 'PubDate', 'US Application No.', 'Applicant', 'Inventor', 'Int.Class']

Columns of 'IPC':
 ['主申请类(soopat)', '主申请类链接(soopat)', '分类号(wipo)', '分类号链接(wipo)', '标准分类号', '分类表年份', '部', '部（说明）', '大类', '大类（解释）', '小类', '小类（说明）', '大组', '大组（说明）', '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', '六点小组', '六点小组（说明）']



In [105]:
# old-new column dict for China Patent
# The sheet's 'Unnamed: 5' and 'Unnamed: 14' columns have no entry here, so
# they are dropped when clean_df_cols selects only the mapped names.
china_patent_cols_dict = {'申请号':'App_No(Soopat)', '申请号(search)':'App_No', 
                          '主申请类':'IPC(Soopat)', '类型':'Type', 'Agent':'Firm', 
                          '申请人':'Applicant_EN', '申请人(soopat)':'Applicant_CN','地区': 'China_Judct', 
                          '申请日':'AppDate', '公开(公告)号':'Pub_No', 
                          '公开(公告)日':'PubDate', '名称': 'Title_CN',
                          '发明名称': 'Title_EN'}

# old-new column dict for PCT
# `cols` and `newcols` are parallel lists (same length, same order) that are
# zipped into the mapping; several PCT names map to themselves.
cols = ['Application No.', 'Agent', 'Firm', 'City', 'Country', 'Title', 'PubDate', 
 'US Application No.', 'Applicant', 'Inventor', 'Int.Class']
newcols = ['App_No(WIPO)', 'Agent', 'Firm', 'City', 'Country', 'Title_EN', 'PubDate', 
             'US_App_No', 'Applicant_EN', 'Inventor', 'IPC(WIPO)']
pct_cols_dict = dict(zip(cols, newcols))

# old-new column dict for IPC
# `cols` and `newcols` are reused here and overwrite the PCT lists above.
# Only the first six names and the fourth name of the third row change;
# every other column keeps its original name.
cols = ['主申请类(soopat)', '主申请类链接(soopat)', '分类号(wipo)', 
 '分类号链接(wipo)', '标准分类号', '分类表年份', '部', '部（说明）', 
 '大类', '大类（解释）', '小类', '小类（说明）', '大组', '大组（说明）', 
 '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', 
 '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', 
 '六点小组', '六点小组（说明）']
newcols = ['IPC(Soopat)', 'IPC_Link(Soopat)', 'IPC(WIPO)', 
 'IPC_link(WIPO)', 'IPC', 'IPC_Version', '部', '部（说明）', 
 '大类', '大类（说明）', '小类', '小类（说明）', '大组', '大组（说明）', 
 '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', 
 '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', 
 '六点小组', '六点小组（说明）']
IPC_cols_dict = dict(zip(cols, newcols))

In [106]:
# Apply each rename map to its dataset and store the cleaned frame under the same key
column_maps = {
    'China Patent': china_patent_cols_dict,
    'PCT': pct_cols_dict,
    'IPC': IPC_cols_dict,
}
for dataset_name, column_map in column_maps.items():
    data[dataset_name] = clean_df_cols(data[dataset_name], column_map)

# Convert Date Type for Dates

In [107]:
# Find the columns whose names contain `date` (the pattern is a case-insensitive regex)
print_columns_of_all_datasets(data, r'date')

Columns of 'China Patent':
 ['AppDate', 'PubDate']

Columns of 'PCT':
 ['PubDate']

Columns of 'IPC':
 []



## Convert dates in `China Patent`

In [108]:
# Preview the date columns before conversion to see how the dates are written
data['China Patent'][['AppDate', 'PubDate']].head()

Unnamed: 0,AppDate,PubDate
0,2017.11.10,2018.05.15
1,2017.11.10,2018.05.15
2,2017.12.22,2018.06.26
3,2008.10.20,2010.12.15
4,2017.11.10,2018.03.20


In [109]:
# Parse the dotted year.month.day values into datetimes
f = '%Y.%m.%d'
date_cols = ['AppDate', 'PubDate']
china_dates = data['China Patent'][date_cols]
# errors='coerce' turns any value that does not match the format into NaT
data['China Patent'][date_cols] = china_dates.apply(
    lambda column: pd.to_datetime(column, format=f, errors='coerce')
)
# Number of missing values per date column after the conversion
print(data['China Patent'][date_cols].isna().sum())
data['China Patent'][date_cols].head()

AppDate     0
PubDate    15
dtype: int64


Unnamed: 0,AppDate,PubDate
0,2017-11-10,2018-05-15
1,2017-11-10,2018-05-15
2,2017-12-22,2018-06-26
3,2008-10-20,2010-12-15
4,2017-11-10,2018-03-20


In [110]:
# Check whether every row that lacks a `PubDate` also lacks a `Pub_No`
china_patent = data['China Patent']
no_pub_date = china_patent['PubDate'].isna()
china_patent.loc[no_pub_date, 'Pub_No'].isna().all()

True

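Every record with a missing `PubDate` also has a missing `Pub_No`, which suggests the publication details are simply absent from the source rather than lost in parsing, so these missing dates need no further handling.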

## Convert dates in `PCT`

In [111]:
# Preview the `PubDate` values in `PCT` before conversion to see how the dates are written
data['PCT'].loc[:,'PubDate'].head()

0    08.10.2009
1    29.10.2009
2    08.07.2010
3    23.12.2009
4    23.12.2009
Name: PubDate, dtype: object

In [112]:
# PCT publication dates are written day.month.year
f = '%d.%m.%Y'
# Assign with df[col] = ... so the column is replaced and takes the datetime64
# dtype. `df.loc[:, col] = ...` writes into the existing object column on
# pandas >= 2.0, which would leave the dtype as object.
# errors='coerce' turns any value that does not match the format into NaT.
data['PCT']['PubDate'] = pd.to_datetime(data['PCT']['PubDate'], format=f, errors='coerce')
# Number of missing dates after the conversion
print(data['PCT']['PubDate'].isna().sum())
data['PCT']['PubDate'].head()

0


0   2009-10-08
1   2009-10-29
2   2010-07-08
3   2009-12-23
4   2009-12-23
Name: PubDate, dtype: datetime64[ns]

## Add `AppDate` in `PCT`

Extract the year from `App_No(WIPO)` (the text between the slashes) and store it as a date, which falls on 1 January of that year.

In [113]:
# Capture the text between the first and last '/' of the application number.
# expand=False returns a Series rather than a one-column DataFrame.
app_year = data['PCT']['App_No(WIPO)'].str.extract(r'\/(.*)\/', expand=False)
# Parse the captured year explicitly: astype('datetime64') without a unit is
# rejected by pandas >= 2.0. This assumes the captured text is a four-digit
# year, as the displayed output suggests; any other text raises an error.
data['PCT']['AppDate'] = pd.to_datetime(app_year, format='%Y')
data['PCT']['AppDate'].head()

0   2009-01-01
1   2009-01-01
2   2010-01-01
3   2009-01-01
4   2009-01-01
Name: AppDate, dtype: datetime64[ns]

# Remove Duplicate Patents

In [114]:
# Drop China patents that repeat the same (`App_No`, `Pub_No`, `Type`) combination
subset = ['App_No', 'Pub_No', 'Type']  # columns that together identify a duplicate
china_sorted = data['China Patent'].sort_values(by=subset)
data['China Patent'] = china_sorted.drop_duplicates(subset=subset)
# For PCT, only rows that are identical in every column are dropped
pct_sorted = data['PCT'].sort_values(by='App_No(WIPO)')
data['PCT'] = pct_sorted.drop_duplicates()

# Merge `China Patent` with `IPC`

In [115]:
# Attach the IPC classification details to the China patents via `IPC(Soopat)`
data["China_Patent_with_IPC"] = pd.merge(
    data['China Patent'],
    data['IPC'],
    how='inner',  # the default join: rows without a matching key are left out
    on='IPC(Soopat)',
)
data["China_Patent_with_IPC"].columns

Index(['App_No(Soopat)', 'App_No', 'IPC(Soopat)', 'Type', 'Firm',
       'Applicant_EN', 'Applicant_CN', 'China_Judct', 'AppDate', 'Pub_No',
       'PubDate', 'Title_CN', 'Title_EN', 'IPC_Link(Soopat)', 'IPC(WIPO)',
       'IPC_link(WIPO)', 'IPC', 'IPC_Version', '部', '部（说明）', '大类', '大类（说明）',
       '小类', '小类（说明）', '大组', '大组（说明）', '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）',
       '三点小组', '三点小组（说明）', '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', '六点小组',
       '六点小组（说明）'],
      dtype='object')

In [116]:
# Columns retained in the merged China patent table, in output order
cols = [
    'App_No(Soopat)', 'App_No', 'Type', 'Firm', 'Applicant_CN',
    'AppDate', 'Pub_No', 'PubDate', 'Title_CN',
    'IPC', 'IPC_Link(Soopat)', 'IPC_link(WIPO)', 'IPC_Version',
    '部', '大类', '小类', '大组',
]
data["China_Patent_with_IPC"] = data["China_Patent_with_IPC"].loc[:, cols]

# Merge `PCT` with `IPC`

In [117]:
# Attach the IPC classification details to the PCT applications via `IPC(WIPO)`
data['PCT_with_IPC'] = pd.merge(
    data['PCT'],
    data['IPC'],
    how='inner',  # the default join: rows without a matching key are left out
    on='IPC(WIPO)',
)
data['PCT_with_IPC'].columns

Index(['App_No(WIPO)', 'Agent', 'Firm', 'City', 'Country', 'Title_EN',
       'PubDate', 'US_App_No', 'Applicant_EN', 'Inventor', 'IPC(WIPO)',
       'AppDate', 'IPC(Soopat)', 'IPC_Link(Soopat)', 'IPC_link(WIPO)', 'IPC',
       'IPC_Version', '部', '部（说明）', '大类', '大类（说明）', '小类', '小类（说明）', '大组',
       '大组（说明）', '一点小组', '一点小组（说明）', '二点小组', '二点小组（说明）', '三点小组', '三点小组（说明）',
       '四点小组', '四点小组（说明）', '五点小组', '五点小组（说明）', '六点小组', '六点小组（说明）'],
      dtype='object')

In [118]:
# Columns retained in the merged PCT table, in output order
cols = [
    'App_No(WIPO)', 'AppDate', 'Agent', 'Firm', 'City', 'Country',
    'Title_EN', 'PubDate', 'US_App_No', 'Applicant_EN',
    'IPC', 'IPC(WIPO)', 'IPC_Link(Soopat)', 'IPC_link(WIPO)', 'IPC_Version',
    '部', '大类', '小类', '大组',
]
data['PCT_with_IPC'] = data['PCT_with_IPC'].loc[:, cols]

# Export datasets

In [119]:
# List the datasets currently held in `data` before exporting them
data.keys()

dict_keys(['China Patent', 'PCT', 'IPC', 'China_Patent_with_IPC', 'PCT_with_IPC'])

In [120]:
import os

# Output folder for the cleaned tables; exist_ok=True makes the cell safe to re-run
outdir = './cleaned'
os.makedirs(outdir, exist_ok=True)

# Write one CSV file per dataset, named after its key in `data`
for key, frame in data.items():
    fname = os.path.join(outdir, key + '.csv')
    # Use the canonical codec name; the previous literal carried a stray
    # typographic quote after 'utf-8'.
    frame.to_csv(fname, encoding='utf-8')