## Objectives

Clean the `PCT.xlsx` data, then split it into two data tables:

- PCT (one row per unique PCT code, with `Title`, `Applicant`, `Designated to CN`, `Filing Date` and `Link`)

- PCT with IPC (each PCT might have multiple IPC codes; each PCT–IPC pair is listed as one record)

In [24]:
import numpy as np
import pandas as pd
# Load the raw PCT export into a DataFrame and preview the first rows.
pct = pd.read_excel(io="./Data/To clean in Jupyter/PCT.xlsx")
pct.head(n=5)

Unnamed: 0,PCT,Title,Applicant,Designated to CN,IPC,Filing Date,Link
0,WO2017075268,TROPONIN I AND SOLUBLE UROKINASE RECEPTOR DETE...,ABBOTT LABORATORIES,1,G01N 33/68 (2006.01),27.10.2016,https://patentscope.wipo.int/search/en/detail....
1,WO2008080030,CARDIOVASCULAR AUTOIMMUNE DISEASE PANEL AND ME...,ABBOTT LABORATORIES,1,"G01N 33/53 (2006.01) ,G01N 33/49 (2006.01)",21.12.2007,https://patentscope.wipo.int/search/en/detail....
2,WO1997036902,"TRANS-2,6-, 3,6- AND 4,6-DIAZA-5,6,6a,7,8,12b-...",ABBOTT LABORATORIES,1,"G01N 33/53 (2006.01) ,G01N 33/49 (2006.01)",21.12.2007,https://patentscope.wipo.int/search/en/detail....
3,WO1995027526,CORONARY SINUS CATHETER INTRODUCER SYSTEM,ABBOTT LABORATORIES,0,C07D 471/04 (2006.01),21.03.1997,https://patentscope.wipo.int/search/en/detail....
4,WO1994022858,TETRACYCLIC COMPOUNDS AS DOPAMINE AGONISTS,ABBOTT LABORATORIES,1,"C07D 491/04 (2006.01) ,C07D 495/04 (2006.01) ,...",18.03.1994,https://patentscope.wipo.int/search/en/detail....


## Clean Data

In [25]:
# Inspect the schema and non-null counts to spot columns with missing values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
pct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 7 columns):
PCT                 473 non-null object
Title               473 non-null object
Applicant           473 non-null object
Designated to CN    473 non-null int64
IPC                 99 non-null object
Filing Date         100 non-null object
Link                473 non-null object
dtypes: int64(1), object(6)
memory usage: 25.9+ KB
None


In [26]:
# Tally how often each PCT code occurs; any count above 1 flags a repeated code
# to look at during clean-up.
pct["PCT"].value_counts()

WO2018132617    2
WO2010141173    1
WO2011136815    1
WO2009076224    1
WO2011123306    1
WO2008008437    1
WO2007149457    1
WO2007123872    1
WO2008005277    1
WO2012051195    1
WO2014142808    1
WO2015057289    1
WO2012021513    1
WO2008091747    1
WO2012145106    1
WO2014107291    1
WO2008008137    1
WO2009070209    1
WO2019079780    1
WO2016073630    1
WO2018093663    1
WO2012154842    1
WO2008005390    1
WO2010132258    1
WO2009114326    1
WO1997036902    1
WO1995027526    1
WO2007081550    1
WO2016057402    1
WO2010002604    1
               ..
WO2008016490    1
WO2010091100    1
WO2014152553    1
WO2012037510    1
WO2008002667    1
WO2016090004    1
WO2007146411    1
WO2011011242    1
WO2010111124    1
WO2014018123    1
WO2010085374    1
WO2014159743    1
WO2016003922    1
WO2011022159    1
WO2011019851    1
WO2010141765    1
WO2008076771    1
WO2015191758    1
WO2014113527    1
WO2014028306    1
WO2015048750    1
WO2008008436    1
WO2011137372    1
WO2014106116    1
WO20071462

In [27]:
# Clean data
# Keep only the rows that have an IPC classification. The explicit .copy() makes
# `pct` an independent frame, so the column assignment below writes to real data
# instead of a possible view of the unfiltered frame (SettingWithCopyWarning).
pct = pct[pct["IPC"].notnull()].copy()
# Remove stray tab characters from the PCT codes. The vectorized .str accessor
# treats "\t" as a literal (regex=False) and passes missing values through
# instead of raising on them.
pct["PCT"] = pct["PCT"].str.replace("\t", "", regex=False)

## Convert dates

In [28]:
# Check whether the filing dates are already datetime objects.
# Use positional access: `[0]` is a label lookup, and after rows have been
# filtered out the label 0 is not guaranteed to exist in the index, whereas
# .iloc[0] returns the first remaining row.
type(pct["Filing Date"].iloc[0])

str

In [29]:
# Parse the day.month.year strings (e.g. "27.10.2016") into datetime64 values.
# assign() returns a new frame with the column replaced in its original
# position, rather than writing into `pct` in place; this avoids pandas'
# SettingWithCopyWarning when `pct` is a filtered slice of another frame.
pct = pct.assign(
    **{"Filing Date": pd.to_datetime(pct["Filing Date"], format="%d.%m.%Y")}
)
pct["Filing Date"].head()

0   2016-10-27
1   2007-12-21
2   2007-12-21
3   1997-03-21
4   1994-03-18
Name: Filing Date, dtype: datetime64[ns]

## Split IPC data

In [30]:
# Split the comma-separated IPC string of every PCT into individual codes and
# stack them into one long Series indexed by (PCT code, position in the list).
# stack() drops the empty slots of PCTs that have fewer codes than the longest list.
# The raw separator is " ," so each piece is stripped; otherwise the same code
# would appear both with and without surrounding whitespace.
IPC_data = (
    pd.DataFrame(pct.IPC.str.split(',').tolist(), index=pct.PCT)
    .stack()
    .str.strip()
)
IPC_data.head()

PCT            
WO2017075268  0     G01N 33/68 (2006.01)
WO2008080030  0    G01N 33/53 (2006.01) 
              1     G01N 33/49 (2006.01)
WO1997036902  0    G01N 33/53 (2006.01) 
              1     G01N 33/49 (2006.01)
dtype: object

In [31]:
# Number of PCT-IPC pairs produced by the split.
IPC_data.shape[0]

296

In [32]:
# Flatten the two-level index: discard the inner positional level, then turn the
# PCT level into an ordinary column. The result has the columns "PCT" and 0 and
# a fresh default row index.
IPC_data = IPC_data.reset_index(level=1, drop=True).reset_index()
IPC_data.head()

Unnamed: 0,PCT,0
0,WO2017075268,G01N 33/68 (2006.01)
1,WO2008080030,G01N 33/53 (2006.01)
2,WO2008080030,G01N 33/49 (2006.01)
3,WO1997036902,G01N 33/53 (2006.01)
4,WO1997036902,G01N 33/49 (2006.01)


In [33]:
# Give the unnamed value column (labelled 0) a descriptive name.
IPC_data = IPC_data.rename(columns={0: "IPC"})
IPC_data.head()

Unnamed: 0,PCT,IPC
0,WO2017075268,G01N 33/68 (2006.01)
1,WO2008080030,G01N 33/53 (2006.01)
2,WO2008080030,G01N 33/49 (2006.01)
3,WO1997036902,G01N 33/53 (2006.01)
4,WO1997036902,G01N 33/49 (2006.01)


In [34]:
# Derive the IPC subclass: the first four characters of the code, e.g. "G01N".
# The .str accessor keeps the result on IPC_data's own index, so the new column
# lines up row by row whatever the index looks like (a fresh pd.Series built from
# a list is aligned by label and only matches a default 0..n-1 index).
# Whitespace is stripped first so a leading blank cannot shift the prefix.
IPC_data["IPC SubClass"] = IPC_data["IPC"].str.strip().str[:4]
IPC_data.head()

Unnamed: 0,PCT,IPC,IPC SubClass
0,WO2017075268,G01N 33/68 (2006.01),G01N
1,WO2008080030,G01N 33/53 (2006.01),G01N
2,WO2008080030,G01N 33/49 (2006.01),G01N
3,WO1997036902,G01N 33/53 (2006.01),G01N
4,WO1997036902,G01N 33/49 (2006.01),G01N


In [35]:
# List the distinct IPC subclasses in order of first appearance.
pd.unique(IPC_data["IPC SubClass"])

array(['G01N', 'C07D', 'A61K', 'C07C', 'C07K', 'A61M', 'G06F', 'A61P',
       'A23L', 'C08G', 'A61F', 'B29C', 'A61B', 'A61L', 'C22C', 'C22F',
       'C08J', 'G06T', 'G09B', 'B23K', 'C08L', 'F26B', 'B05D', 'B05B',
       'B05C', 'C08H', 'A61N'], dtype=object)

In [36]:
# Write the PCT-IPC table to CSV. The row index is written as the first,
# unnamed column (pandas' default for to_csv).
IPC_data.to_csv(path_or_buf="./Data/Ready for Excel/PCT_IPC.csv")

## Export Clean PCT data

In [37]:
# Columns kept in the PCT table: every column of `pct` listed below; "IPC" is
# not among them.
columns = [
    "PCT",
    "Title",
    "Applicant",
    "Designated to CN",
    "Filing Date",
    "Link",
]
new_pct = pct.loc[:, columns]
new_pct.head()

Unnamed: 0,PCT,Title,Applicant,Designated to CN,Filing Date,Link
0,WO2017075268,TROPONIN I AND SOLUBLE UROKINASE RECEPTOR DETE...,ABBOTT LABORATORIES,1,2016-10-27,https://patentscope.wipo.int/search/en/detail....
1,WO2008080030,CARDIOVASCULAR AUTOIMMUNE DISEASE PANEL AND ME...,ABBOTT LABORATORIES,1,2007-12-21,https://patentscope.wipo.int/search/en/detail....
2,WO1997036902,"TRANS-2,6-, 3,6- AND 4,6-DIAZA-5,6,6a,7,8,12b-...",ABBOTT LABORATORIES,1,2007-12-21,https://patentscope.wipo.int/search/en/detail....
3,WO1995027526,CORONARY SINUS CATHETER INTRODUCER SYSTEM,ABBOTT LABORATORIES,0,1997-03-21,https://patentscope.wipo.int/search/en/detail....
4,WO1994022858,TETRACYCLIC COMPOUNDS AS DOPAMINE AGONISTS,ABBOTT LABORATORIES,1,1994-03-18,https://patentscope.wipo.int/search/en/detail....


In [38]:
# Write the cleaned PCT table to CSV. The row index is written as the first,
# unnamed column (pandas' default for to_csv).
new_pct.to_csv(path_or_buf="./Data/Ready for Excel/clean_PCT.csv")