In [None]:
import nltk

# Fetch only the tokenizer models used by word_tokenize. A bare nltk.download()
# opens the interactive downloader, which blocks an unattended
# "Restart Kernel -> Run All". Newer NLTK releases load 'punkt_tab', older ones
# load 'punkt', so both are requested.
# NOTE(review): if other NLTK resources are needed elsewhere, add them here.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

In [108]:
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import json

In [2]:
# Load the pickled patent records and wrap them in a DataFrame.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('../Toy_Dataset/patent_200k_reparse_1.p', 'rb') as file:
    data = pickle.load(file)
df_raw = pd.DataFrame(data)

In [4]:
# NOTE(review): the loading cell above already builds df_raw from `data`;
# this cell repeats that construction.
df_raw = pd.DataFrame(data)

## Data analysis

In [5]:
# Character offsets (start, stop) of each fixed-width field inside a raw CPC
# code string. Each pair is used as a Python slice, so `stop` is exclusive.
# All hierarchical cpc_* levels start at offset 18, so every deeper level also
# contains the text of the levels above it.
cpc_field_slice_dict = dict(
    kind=(0, 2),
    application_number=(2, 10),
    document_number=(10, 18),
    cpc_section=(18, 19),
    cpc_class=(18, 21),
    cpc_subclass=(18, 22),
    cpc_main_group=(18, 26),
    cpc_subgroup=(18, 33),
    cpc_version_date=(33, 41),
    cpc_symbol_position=(41, 42),
    cpc_value_code=(42, 43),
    cpc_set_group=(43, 46),
    cpc_set_rank=(46, 48),
)

In [6]:
# number of rows in df_raw
# NOTE(review): this was labelled "number of claims", but a later cell treats
# each row as one patent - confirm which is meant.
len(df_raw)

5999

In [7]:
# Flatten the per-row lists of CPC codes into one flat list.
flattened_cpc = [
    cpc_code
    for cpc_code_list in df_raw["cpc_codes"]
    for cpc_code in cpc_code_list
]

In [8]:
# total number of CPC code entries across all rows (repeated codes are counted
# each time, since flattened_cpc is a plain list)
len(flattened_cpc)

99039

In [9]:
# Build a text-free analytics frame: one row per CPC code, one column per
# fixed-width field sliced out of the code string.
levels = ["cpc_section", "cpc_class", "cpc_subclass", "cpc_main_group", "cpc_subgroup"]
df_analytics = pd.DataFrame(
    [
        [cpc[start:stop] for start, stop in cpc_field_slice_dict.values()]
        for cpc in flattened_cpc
    ],
    columns=list(cpc_field_slice_dict),
)

In [10]:
# preview the first rows of the per-code analytics frame
df_analytics.head()

Unnamed: 0,kind,application_number,document_number,cpc_section,cpc_class,cpc_subclass,cpc_main_group,cpc_subgroup,cpc_version_date,cpc_symbol_position,cpc_value_code,cpc_set_group,cpc_set_rank
0,B2,13496915,10267362,F,F16,F16C,F16C 33,F16C 33/203,20130101,F,I,0,0
1,B2,13496915,10267362,F,F16,F16C,F16C 33,F16C 33/201,20130101,L,I,0,0
2,B2,13496915,10267362,F,F16,F16C,F16C 33,F16C 33/208,20130101,L,I,0,0
3,B2,13496915,10267362,F,F16,F16C,F16C2220,F16C2220/28,20130101,L,A,0,0
4,B2,13496915,10267362,F,F16,F16C,F16C2326,F16C2326/00,20130101,L,A,0,0


In [11]:
# number of unique values at different levels
unique_label_dict = {level: set(df_analytics[level]) for level in levels}
for level, unique_labels in unique_label_dict.items():
    print(f"number of unique {level}: {len(unique_labels)}")

number of unique cpc_section: 9
number of unique cpc_class: 127
number of unique cpc_subclass: 579
number of unique cpc_main_group: 4810
number of unique cpc_subgroup: 38009


In [12]:
# Number of CPC codes attached to each row.
df_raw["num_label"] = df_raw["cpc_codes"].apply(len)

In [13]:
# share of patents with exactly one label, as a fraction in [0, 1]
# (the value is not multiplied by 100)
len(df_raw[df_raw["num_label"] == 1]) / len(df_raw["num_label"])

0.044674112352058676

In [14]:
# Distribution of labels per row: absolute count next to the percentage share
# (rounded to one decimal and rendered as a string with a trailing "%").
pd.concat([df_raw["num_label"].value_counts(),
           df_raw["num_label"].value_counts(normalize=True).mul(100)
           .round(1).astype(str)+"%"], axis=1, keys=['count', '%'])

Unnamed: 0,count,%
3,414,6.9%
5,410,6.8%
4,404,6.7%
6,399,6.7%
2,377,6.3%
...,...,...
172,1,0.0%
114,1,0.0%
148,1,0.0%
140,1,0.0%


## Data preprocessing

In [99]:
# Text fields concatenated into each document.
# Available values: 'title', 'abstraction', 'claims', 'brief_summary', 'description'
text_columns = ["title", "abstraction", "claims"]

# CPC hierarchy levels joined into each label.
# Available values: 'cpc_section', 'cpc_class', 'cpc_subclass', 'cpc_main_group', 'cpc_subgroup'
label_columns = ["cpc_section", "cpc_class", "cpc_subclass"]

In [100]:
def extract_labels(cpc_codes, label_columns, slice_dict=None):
    """Build the distinct hierarchical labels for one row's CPC codes.

    For every code, the fields named in ``label_columns`` are sliced out of
    the code string and joined with ``"--"``.

    Args:
        cpc_codes: iterable of raw CPC code strings.
        label_columns: field names to extract, in the order they are joined.
        slice_dict: mapping of field name to ``(start, stop)`` slice bounds.
            Defaults to the notebook-level ``cpc_field_slice_dict``.

    Returns:
        List of distinct labels in order of first appearance. The order is
        deterministic, unlike ``list(set(...))``, whose order for strings can
        change between interpreter runs because of hash randomisation.

    Raises:
        KeyError: if a name in ``label_columns`` is missing from the mapping.
    """
    if slice_dict is None:
        slice_dict = cpc_field_slice_dict
    # Resolve the slice bounds once instead of once per code.
    bounds = [slice_dict[label_column] for label_column in label_columns]
    # A dict keeps insertion order, so it de-duplicates without reordering.
    labels = {}
    for cpc_code in cpc_codes:
        label = "--".join(cpc_code[start:stop] for start, stop in bounds)
        labels.setdefault(label, None)
    return list(labels)

In [None]:
def tokenize(text, tokenizer=None):
    """Split ``text`` into lower-cased tokens, dropping punctuation tokens.

    A token is dropped when every character in it is ASCII punctuation. This
    also removes multi-character punctuation tokens such as ``...``, ``--``
    and the two-character quote tokens, which a substring test against
    ``string.punctuation`` lets through.

    Args:
        text: the string to tokenize.
        tokenizer: callable mapping a string to an iterable of token strings.
            Defaults to ``word_tokenize``.

    Returns:
        List of lower-cased tokens in their original order.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    punctuation_chars = set(string.punctuation)
    return [
        token.lower()
        for token in tokenizer(text)
        # all() is True for an empty token, so empty strings are dropped too.
        if not all(char in punctuation_chars for char in token)
    ]

In [118]:
def merge_data(text_columns, label_columns, filename):
    """Export labels and tokens for every row of ``df_raw`` as JSON records.

    Args:
        text_columns: ``df_raw`` columns whose values are joined with a single
            space and then tokenized.
        label_columns: CPC field names passed to ``extract_labels``.
        filename: destination passed to ``DataFrame.to_json``.

    Returns:
        The return value of ``DataFrame.to_json``.
    """
    doc_labels = df_raw['cpc_codes'].apply(extract_labels, args=(label_columns,))
    doc_tokens = df_raw[text_columns].agg(' '.join, axis=1).apply(tokenize)
    records = pd.DataFrame({'doc_label': doc_labels, 'doc_token': doc_tokens})
    return records.to_json(filename, orient='records')

In [119]:
# write the label/token records for the configured columns to toy_data.json
merge_data(text_columns, label_columns, 'toy_data.json')

## Create taxonomy

In [122]:
import pickle
# Input: pickled CPC label tree. Output: taxonomy text file written further below.
cpc_label_tree_path = '../cpc_label_tree.pkl'
taxonomy_path = 'data/cpc.taxonomy'

# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open(cpc_label_tree_path,"rb") as f:
    cpc_label_tree = pickle.load(f)

In [125]:
"\t".join(cpc_label_tree['Root'].keys())

'A\tG\tY\tH\tB\tF\tC\tD\tE'

In [154]:
def _write_taxonomy_node(out, label, children, levels_below):
    """Write one taxonomy line for ``label``, then recurse into its children.

    Each line has the form ``label<TAB>child1<TAB>child2...``. Nodes are
    written in pre-order: a parent line comes before its descendants.

    Args:
        out: writable text file object.
        label: name of the current node.
        children: mapping of child name to that child's own children, or, at
            the deepest level, a plain iterable of child names.
        levels_below: how many further levels under ``label`` get their own
            line; 0 means the children are listed but not expanded.
    """
    if isinstance(children, (set, frozenset)):
        # Set iteration order for strings can differ between interpreter
        # runs; sorting keeps the written file reproducible.
        child_names = sorted(children)
    else:
        child_names = list(children)
    joined = "\t".join(child_names)
    out.write(f"{label}\t{joined}\n")

    if levels_below > 0:
        for child_label, grandchildren in children.items():
            _write_taxonomy_node(out, child_label, grandchildren, levels_below - 1)


with open(taxonomy_path, "w") as f:

    root_dict = cpc_label_tree['Root']
    # Four levels below Root get their own line:
    # section -> class -> subclass -> main group.
    _write_taxonomy_node(f, "Root", root_dict, 4)