In [1]:
import nltk

# Download only the tokenizer models needed by nltk.tokenize.word_tokenize.
# A bare nltk.download() opens the interactive downloader, which stalls a
# non-interactive "Restart Kernel -> Run All".
# NOTE(review): older NLTK releases load "punkt", newer ones "punkt_tab";
# both are requested here — confirm which one the installed version needs.
for tokenizer_resource in ("punkt", "punkt_tab"):
    nltk.download(tokenizer_resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [27]:
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import json
import os
from sklearn.model_selection import train_test_split

In [7]:
def process_API_data(directory):
    """Load every pickled file in ``directory`` into a single DataFrame.

    Each regular file directly inside ``directory`` is unpickled, wrapped in a
    ``pd.DataFrame`` and appended; the frames are then concatenated with a
    fresh 0..n-1 index.

    Files are visited in sorted name order because ``os.listdir`` returns
    entries in arbitrary order; a stable row order keeps any later seeded
    split reproducible. Sub-directories are skipped instead of crashing
    ``open``.

    Parameters
    ----------
    directory : str
        Folder that contains the pickle files.

    Returns
    -------
    pd.DataFrame
        All loaded frames stacked row-wise.

    Raises
    ------
    ValueError
        If ``directory`` contains no files to load.
    """
    frames = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # e.g. a nested folder; nothing to unpickle here
            continue
        # NOTE: pickle.load can execute arbitrary code — only use trusted files.
        with open(file_path, 'rb') as file:
            frames.append(pd.DataFrame(pickle.load(file)))
        print("finished processing file {}; count = {}".format(filename, len(frames)))
    if not frames:
        raise ValueError("no files to load in directory {!r}".format(directory))
    return pd.concat(frames, ignore_index=True)

In [8]:
# folder holding the pickled raw data files, and the combined frame built from them
API_data_directory = "data/raw_data"
df_raw = process_API_data(directory=API_data_directory)

## Data analysis

In [11]:
# (start, stop) character offsets of each field inside a raw CPC code string.
# The hierarchy levels all start at offset 18, so every deeper level also
# contains the text of the levels above it.
cpc_field_slice_dict = dict(
    kind=(0, 2),
    application_number=(2, 10),
    document_number=(10, 18),
    cpc_section=(18, 19),
    cpc_class=(18, 21),
    cpc_subclass=(18, 22),
    cpc_main_group=(18, 26),
    cpc_subgroup=(18, 33),
    cpc_version_date=(33, 41),
    cpc_symbol_position=(41, 42),
    cpc_value_code=(42, 43),
    cpc_set_group=(43, 46),
    cpc_set_rank=(46, 48),
)

In [12]:
# number of rows in df_raw — presumably one row per patent document rather
# than per individual claim (TODO confirm against the raw data)
len(df_raw)

40000

In [13]:
# flatten the per-row collections of CPC codes into one flat list
flattened_cpc = []
for row_codes in df_raw["cpc_codes"]:
    flattened_cpc.extend(row_codes)

In [14]:
# total number of CPC code entries across all rows of df_raw
# (every occurrence is counted; this is not the number of distinct labels)
len(flattened_cpc)

271771

In [15]:
# build an analytics table with one row per CPC code and one column per
# fixed-width field; no patent text is included
levels = ["cpc_section", "cpc_class", "cpc_subclass", "cpc_main_group", "cpc_subgroup"]
field_names = list(cpc_field_slice_dict)
field_spans = list(cpc_field_slice_dict.values())
analytics_rows = [[cpc[start:stop] for start, stop in field_spans]
                  for cpc in flattened_cpc]
df_analytics = pd.DataFrame(analytics_rows, columns=field_names)

In [16]:
# preview the first rows of the per-code analytics table
df_analytics.head()

Unnamed: 0,kind,application_number,document_number,cpc_section,cpc_class,cpc_subclass,cpc_main_group,cpc_subgroup,cpc_version_date,cpc_symbol_position,cpc_value_code,cpc_set_group,cpc_set_rank
0,B1,9646314,6405541,F,F25,F25J,F25J 1,F25J 1/001,20130101,F,I,0,0
1,B1,9646314,6405541,F,F25,F25J,F25J 1,F25J 1/0221,20130101,L,I,0,0
2,B1,9646314,6405541,F,F25,F25J,F25J 1,F25J 1/0275,20130101,L,I,0,0
3,B1,9646314,6405541,B,B64,B64G,B64G 1,B64G 1/402,20130101,L,A,0,0
4,B1,9646314,6405541,F,F25,F25J,F25J2205,F25J2205/20,20130101,L,A,0,0


In [17]:
# number of unique values at each hierarchy level
unique_label_dict = {level: set(df_analytics[level]) for level in levels}
for level, unique_values in unique_label_dict.items():
    print(f"number of unique {level}: {len(unique_values)}")

number of unique cpc_section: 9
number of unique cpc_class: 128
number of unique cpc_subclass: 634
number of unique cpc_main_group: 7222
number of unique cpc_subgroup: 83304


In [18]:
# count how many CPC codes each row carries
df_raw["num_label"] = df_raw["cpc_codes"].apply(len)

In [19]:
# share of rows that carry exactly one label (a fraction between 0 and 1, not a percentage)
single_label_count = int((df_raw["num_label"] == 1).sum())
single_label_count / len(df_raw["num_label"])

0.1237

In [20]:
# distribution of labels-per-row: absolute counts next to formatted percentages
label_counts = df_raw["num_label"].value_counts()
label_percent = df_raw["num_label"].value_counts(normalize=True).mul(100).round(1)
label_percent_text = label_percent.astype(str) + "%"
pd.concat([label_counts, label_percent_text], axis=1, keys=['count', '%'])

Unnamed: 0,count,%
2,5810,14.5%
3,5138,12.8%
1,4948,12.4%
4,4299,10.7%
5,3409,8.5%
...,...,...
181,1,0.0%
167,1,0.0%
134,1,0.0%
135,1,0.0%


## Data preprocessing

In [21]:
# Text columns of df_raw that are concatenated into one document per row.
# Options listed by the author: ['title', 'abstraction', 'claims', 'brief_summary', 'description']
text_columns = ['title', 'abstraction', 'claims']

# CPC hierarchy levels that are joined into each label; every entry must be a key
# of cpc_field_slice_dict.
# Options: ['cpc_section', 'cpc_class', 'cpc_subclass', 'cpc_main_group', 'cpc_subgroup']
label_columns = ['cpc_section', 'cpc_class', 'cpc_subclass']

In [22]:
def extract_labels(cpc_codes, label_columns, slice_dict=None):
    """Turn raw CPC code strings into a de-duplicated, sorted list of labels.

    For every code, the substring of each requested level is cut out and the
    pieces are joined with ``"--"`` (e.g. ``"F--F25--F25J"``).

    The result is sorted: the previous ``list(set(...))`` returned the labels
    in hash order, which varies between interpreter runs and made the written
    dataset non-reproducible.

    Parameters
    ----------
    cpc_codes : iterable of str
        Raw fixed-width CPC code strings.
    label_columns : list of str
        Hierarchy levels to include, in output order; each must be a key of
        the slice dictionary.
    slice_dict : dict, optional
        Mapping ``level -> (start, stop)``. Defaults to the module-level
        ``cpc_field_slice_dict``.

    Returns
    -------
    list of str
        Unique labels in ascending order.

    Raises
    ------
    KeyError
        If a name in ``label_columns`` is missing from the slice dictionary.
    """
    if slice_dict is None:
        slice_dict = cpc_field_slice_dict
    # resolve the offsets once instead of once per code
    spans = [slice_dict[label_column] for label_column in label_columns]
    labels = {
        "--".join(cpc_code[start:stop] for start, stop in spans)
        for cpc_code in cpc_codes
    }
    return sorted(labels)

In [23]:
def tokenize(text):
    """Split ``text`` into lower-cased word tokens without punctuation tokens.

    A token is dropped when every one of its characters is in
    ``string.punctuation``. The previous check, ``token not in
    string.punctuation``, was a substring test against the punctuation string,
    so multi-character punctuation tokens such as ``--`` or ``...`` (and the
    quote tokens the tokenizer may emit) were kept.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token.lower()
        for token in word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    ]

In [34]:
def merge_data(text_columns, label_columns, folder, df=None):
    """Build the labelled, tokenized dataset and write train/valid/test JSON files.

    For each row, the CPC codes are converted to labels with ``extract_labels``
    and the selected text columns are joined with spaces and tokenized with
    ``tokenize``. The rows are then split with fixed seeds and written to
    ``train.json``, ``valid.json`` and ``test.json`` inside ``folder``
    (``orient='records'``).

    Parameters
    ----------
    text_columns : list of str
        Columns whose text is concatenated into one document per row.
    label_columns : list of str
        CPC hierarchy levels passed on to ``extract_labels``.
    folder : str
        Output directory; created if it does not exist.
    df : pd.DataFrame, optional
        Source frame with a ``'cpc_codes'`` column and the text columns.
        Defaults to the notebook-level ``df_raw``.

    Returns
    -------
    None
    """
    if df is None:
        df = df_raw

    doc_label = df['cpc_codes'].apply(extract_labels, args=(label_columns,))
    # a missing text value would make ' '.join raise; treat it as empty text
    doc_token = df[text_columns].fillna('').agg(' '.join, axis=1).apply(tokenize)
    df_text = pd.DataFrame({'doc_label': doc_label, 'doc_token': doc_token})

    # 30% is held out, of which about one third becomes the test set:
    # roughly 70% train / 20% valid / 10% test
    df_train, df_valid_test = train_test_split(df_text, test_size=0.3, random_state=1)
    df_valid, df_test = train_test_split(df_valid_test, test_size=0.333, random_state=1)

    os.makedirs(folder, exist_ok=True)
    for split_name, split_df in (("train", df_train), ("valid", df_valid), ("test", df_test)):
        split_df.to_json(os.path.join(folder, split_name + ".json"), orient='records')

In [35]:
# build the dataset from the configured columns and write the split files into 'data'
merge_data(text_columns, label_columns, 'data')

In [26]:
# sanity check: reload the training split and show its first record
with open('data/train.json') as train_file:
    data = json.load(train_file)
print(data[0])

{'doc_label': ['B--B64--B64G', 'F--F25--F25J'], 'doc_token': ['method', 'and', 'device', 'for', 'the', 'production', 'of', 'slush', 'from', 'liquefied', 'gas', 'a', 'method', 'for', 'producing', 'slush', 'from', 'liquefied', 'gas', 'wherein', 'solid', 'crystals', 'are', 'formed', 'and', 'mixed', 'with', 'the', 'liquefied', 'gas', 'to', 'produce', 'slush', 'the', 'solid', 'crystals', 'are', 'produced', 'from', 'liquid', 'particles', 'which', 'are', 'released', 'into', 'or', 'enter', 'a', 'gas', 'atmosphere', 'under', 'pressure', 'wherein', 'the', 'temperature', 'of', 'the', 'gas', 'atmosphere', 'is', 'below', 'the', 'freezing', 'point', 'of', 'the', 'liquid', 'particles', 'a', 'device', 'is', 'also', 'provided', 'for', 'producing', 'the', 'slush', 'from', 'liquefied', 'gas', 'in', 'a', 'cryostat', 'container', 'which', 'is', 'partly', 'filled', 'with', 'the', 'liquefied', 'gas', 'which', 'mixes', 'with', 'the', 'solid', 'crystals', 'to', 'produce', 'slush', 'the', 'device', 'has', 'an',

## Create taxonomy

In [122]:
import pickle

# input: pickled CPC label hierarchy; output: path of the taxonomy text file
cpc_label_tree_path = 'data/cpc_label_tree.pkl'
taxonomy_path = 'data/cpc.taxonomy'

# NOTE: unpickling can run arbitrary code — only load a file you trust
with open(cpc_label_tree_path, "rb") as tree_file:
    cpc_label_tree = pickle.load(tree_file)

In [154]:
def _taxonomy_line(parent, children):
    """Format one taxonomy row: the parent, then its tab-separated children."""
    return f"{parent}\t" + "\t".join(children) + "\n"


# Write one row per node, parents before their descendants (depth-first):
# Root -> sections -> classes -> subclasses -> main groups.
# NOTE(review): the innermost values are joined in their own iteration order;
# if they are sets, that order may differ between runs — confirm.
with open(taxonomy_path, "w") as f:
    root_dict = cpc_label_tree['Root']
    f.write(_taxonomy_line("Root", root_dict.keys()))

    for cpc_section, section_dict in root_dict.items():
        f.write(_taxonomy_line(cpc_section, section_dict.keys()))

        for cpc_class, class_dict in section_dict.items():
            f.write(_taxonomy_line(cpc_class, class_dict.keys()))

            for cpc_subclass, subclass_dict in class_dict.items():
                f.write(_taxonomy_line(cpc_subclass, subclass_dict.keys()))

                for cpc_main_group, main_group_set in subclass_dict.items():
                    f.write(_taxonomy_line(cpc_main_group, main_group_set))