# Objectives

Clean `PCT.xlsx` data and then split into two data tables:
        
- PCT with IPC (one PCT might have multiple IPC codes; each PCT-IPC pair should be displayed as one record)
- PCT (distinct PCTs with `Title`, `Applicant`, `Designated to CN`, `Filing Date` and `Link`)

# Import PCT Data

In [20]:
import numpy as np
import pandas as pd
# Load the raw PCT workbook through the openpyxl engine and preview the first two rows.
pct = pd.read_excel(io="./Data/To clean in Jupyter/PCT.xlsx", engine="openpyxl")
pct.head(n=2)

Unnamed: 0,PCT,Title,Applicant,Designated to CN,IPC,Filing Date,Link
0,WO2017075268,TROPONIN I AND SOLUBLE UROKINASE RECEPTOR DETE...,ABBOTT LABORATORIES,1,G01N 33/68 (2006.01),27.10.2016,https://patentscope.wipo.int/search/en/detail....
1,WO2008080030,CARDIOVASCULAR AUTOIMMUNE DISEASE PANEL AND ME...,ABBOTT LABORATORIES,1,"G01N 33/53 (2006.01) ,G01N 33/49 (2006.01)",21.12.2007,https://patentscope.wipo.int/search/en/detail....


# Clean Data

In [21]:
# check dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which would
# append a stray "None" line to the output).
pct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PCT               473 non-null    object
 1   Title             473 non-null    object
 2   Applicant         473 non-null    object
 3   Designated to CN  473 non-null    int64 
 4   IPC               473 non-null    object
 5   Filing Date       473 non-null    object
 6   Link              473 non-null    object
dtypes: int64(1), object(6)
memory usage: 26.0+ KB
None


In [22]:
# Number of PCT codes that appear on more than one row
# (a PCT may be repeated when it is listed with several IPC entries).
pct["PCT"].value_counts().gt(1).sum()

1

In [4]:
# remove pct without ipc code
# .copy() makes the filtered result an independent DataFrame, so the column
# assignment below modifies this frame rather than a slice of the original
# (avoids pandas' SettingWithCopyWarning / chained-assignment pitfalls).
pct = pct[pct["IPC"].notnull()].copy()

# remove tab character
# The vectorised .str accessor does a literal (non-regex) replacement and
# passes missing values through instead of raising on non-string entries.
pct["PCT"] = pct["PCT"].str.replace('\t', '', regex=False)

# Convert Dates

In [5]:
# check if filing date is datetime object
# .iloc[0] picks the first row by position; indexing with [0] looks up the
# row *label* 0, which raises KeyError if that row was filtered out earlier.
type(pct["Filing Date"].iloc[0])

str

In [23]:
# Parse the "dd.mm.yyyy" filing-date strings into pandas datetimes,
# replacing the original text column, then preview the result.
pct = pct.assign(
    **{"Filing Date": pd.to_datetime(pct["Filing Date"], format="%d.%m.%Y")}
)
pct["Filing Date"].head()

0   2016-10-27
1   2007-12-21
2   2007-12-21
3   1997-03-21
4   1994-03-18
Name: Filing Date, dtype: datetime64[ns]

# Split IPC Data

In [7]:
# split IPC codes
# Each comma-separated IPC string is expanded into one column per code and then
# stacked, giving a Series with a two-level index: (PCT code, position of the
# code within its original string). Padding cells of shorter lists are dropped
# by stack().
# The codes are stripped afterwards because the source separates them with
# " ," — a plain split on ',' would leave blanks attached to the values.
IPC_data = (
    pd.DataFrame(pct.IPC.str.split(',').tolist(), index=pct.PCT)
    .stack()
    .str.strip()
)
IPC_data.head()

PCT            
WO2017075268  0     G01N 33/68 (2006.01)
WO2008080030  0    G01N 33/53 (2006.01) 
              1     G01N 33/49 (2006.01)
WO1997036902  0    G01N 33/53 (2006.01) 
              1     G01N 33/49 (2006.01)
dtype: object

In [8]:
# Total number of PCT-IPC pairs produced by the split.
IPC_data.shape[0]

1176

In [9]:
# Flatten the two-level index: discard the positional level, then turn the
# PCT level into an ordinary column. The unnamed values end up in column 0,
# leaving a plain 0..n-1 row index with one row per PCT-IPC pair.
IPC_data = IPC_data.reset_index(level=1, drop=True).reset_index()
IPC_data.head()

Unnamed: 0,PCT,0
0,WO2017075268,G01N 33/68 (2006.01)
1,WO2008080030,G01N 33/53 (2006.01)
2,WO2008080030,G01N 33/49 (2006.01)
3,WO1997036902,G01N 33/53 (2006.01)
4,WO1997036902,G01N 33/49 (2006.01)


In [10]:
# Give the two columns their final names: the PCT code and its IPC code.
IPC_data = IPC_data.set_axis(["PCT", "IPC"], axis="columns")
IPC_data.head()

Unnamed: 0,PCT,IPC
0,WO2017075268,G01N 33/68 (2006.01)
1,WO2008080030,G01N 33/53 (2006.01)
2,WO2008080030,G01N 33/49 (2006.01)
3,WO1997036902,G01N 33/53 (2006.01)
4,WO1997036902,G01N 33/49 (2006.01)


In [11]:
# add IPC subClass: the first four characters of the IPC code (e.g. "G01N").
# The .str accessor returns a Series that shares IPC_data's own index, so the
# values stay on the right rows whatever that index is (a freshly built Series
# would only line up if the index happened to be 0..n-1).
# strip() guards against blanks left around a code by the comma split, which
# would otherwise shift the four-character slice.
IPC_data["IPC SubClass"] = IPC_data["IPC"].str.strip().str[:4]
IPC_data.head()

Unnamed: 0,PCT,IPC,IPC SubClass
0,WO2017075268,G01N 33/68 (2006.01),G01N
1,WO2008080030,G01N 33/53 (2006.01),G01N
2,WO2008080030,G01N 33/49 (2006.01),G01N
3,WO1997036902,G01N 33/53 (2006.01),G01N
4,WO1997036902,G01N 33/49 (2006.01),G01N


In [12]:
# Distinct IPC subclasses, in order of first appearance.
pd.unique(IPC_data["IPC SubClass"])

array(['G01N', 'C07D', 'A61K', 'C07C', 'C07K', 'A61M', 'G06F', 'A61P',
       'A23L', 'C08G', 'A61F', 'B29C', 'A61B', 'A61L', 'C22C', 'C22F',
       'C08J', 'G06T', 'G09B', 'B23K', 'C08L', 'F26B', 'B05D', 'B05B',
       'B05C', 'C08H', 'A61N', 'C08F', 'C09D', 'B65B', 'B01J', 'C08K',
       'G01G', 'C25F', 'B24C', 'B26F', 'C23C', 'B21D', 'C21D', 'B29B',
       'B29L'], dtype=object)

In [13]:
# Write the PCT-IPC pair table to CSV (the row index is written as the first column).
IPC_data.to_csv(path_or_buf="./Data/Ready for Excel/PCT_IPC.csv")

# Export Clean PCT Data

In [15]:
# Columns kept for the PCT-level table (the IPC column is left out here).
columns = ["PCT", "Title", "Applicant", "Designated to CN", "Filing Date", "Link"]
# Keep one row per PCT code: a PCT that appears on several source rows
# (e.g. once per IPC listing) would otherwise be exported more than once.
# The first occurrence of each PCT is retained.
new_pct = pct[columns].drop_duplicates(subset="PCT")
new_pct.head(2)

Unnamed: 0,PCT,Title,Applicant,Designated to CN,Filing Date,Link
0,WO2017075268,TROPONIN I AND SOLUBLE UROKINASE RECEPTOR DETE...,ABBOTT LABORATORIES,1,2016-10-27,https://patentscope.wipo.int/search/en/detail....
1,WO2008080030,CARDIOVASCULAR AUTOIMMUNE DISEASE PANEL AND ME...,ABBOTT LABORATORIES,1,2007-12-21,https://patentscope.wipo.int/search/en/detail....


In [16]:
# Write the PCT-level table to CSV (the row index is written as the first column).
new_pct.to_csv(path_or_buf="./Data/Ready for Excel/clean_PCT.csv")