## Experiment 1: Single label classification based on patent abstract

### Load and inspect data

In [1]:
import pandas as pd

labels = pd.read_csv("level1_labels.csv", index_col=0)
features = pd.read_csv("feature_stats.csv", index_col=0)
feature_stats = pd.concat([features, labels], axis=1)
feature_stats.head()

Unnamed: 0,path,abstract,description,claims,label,level1labels
AP1605A,E:\MLData\thesis\Datasets\LexisNexis\AP1605A.xml,False,False,False,0,
AP1665A,E:\MLData\thesis\Datasets\LexisNexis\AP1665A.xml,True,False,False,1,Skin care
AP1682A,E:\MLData\thesis\Datasets\LexisNexis\AP1682A.xml,True,False,False,1,Active ingredients
AP1904A,E:\MLData\thesis\Datasets\LexisNexis\AP1904A.xml,True,False,False,1,Hair care
AP1937A,E:\MLData\thesis\Datasets\LexisNexis\AP1937A.xml,True,False,False,1,Packaging


In [14]:
feature_stats = feature_stats[feature_stats["level1labels"].notna()] # drop unlabeled patents
feature_stats = feature_stats[feature_stats["abstract"] == 1] # drop patents that don't contain an abstract
print(f"Number of examples: {labels.size}")
print(feature_stats["level1labels"].value_counts())

Number of examples: 220709
Active ingredients              57205
Skin care                       30095
Packaging                       25063
Health care                     21890
Hair care                       21171
Cleansing                       10824
Sun                              8092
Perfume                          5989
Deo                              4243
Non woven                        3185
Devices                          1899
Lip care                         1861
Decorative cosmetic              1828
Manufacturing technology         1737
Shaving                           919
Sustainability                    477
Personalization                    86
Artificial Intelligence (AI)        5
no follow up                        3
IP7 Beiersdorf                      2
Name: level1labels, dtype: int64


It can be seen that some labels are not relevant for the classification and that there is a huge
class imbalance (57205 : 5). First, the irrelevant classes are dropped as well as the AI class because it
appears only five times. Then, we test a weighting mechanism that gives less frequent classes a higher weight
and more frequent classes a lower weight

In [15]:
# define a set of classes that are excluded
# (AI has not enough instances, the others make no sense for classification here)
exclude_list = ["Artificial Intelligence (AI)", "no follow up", "IP7 Beiersdorf", "Active ingredients"]
mask = feature_stats['level1labels'].isin(exclude_list)
feature_stats =feature_stats[~mask]
value_counts = feature_stats["level1labels"].value_counts()
print(value_counts)

Skin care                   30095
Packaging                   25063
Health care                 21890
Hair care                   21171
Cleansing                   10824
Sun                          8092
Perfume                      5989
Deo                          4243
Non woven                    3185
Devices                      1899
Lip care                     1861
Decorative cosmetic          1828
Manufacturing technology     1737
Shaving                       919
Sustainability                477
Personalization                86
Name: level1labels, dtype: int64


In [16]:
# calculate weights
class_weights = {}
total_instances = len(feature_stats)
for class_id, freq in zip(feature_stats["level1labels"].value_counts().index, feature_stats["level1labels"].value_counts()):
    class_weights[class_id] = (1 / freq)*(total_instances)/2.0
class_weights

{'Skin care': 2.3153181591626515,
 'Packaging': 2.780173961616726,
 'Health care': 3.1831658291457288,
 'Hair care': 3.29127107836191,
 'Cleansing': 6.4375,
 'Sun': 8.610912011863569,
 'Perfume': 11.63458006344966,
 'Deo': 16.42222484091445,
 'Non woven': 21.87739403453689,
 'Devices': 36.69273301737756,
 'Lip care': 37.44196668457818,
 'Decorative cosmetic': 38.117888402625816,
 'Manufacturing technology': 40.11485319516408,
 'Shaving': 75.82100108813928,
 'Sustainability': 146.07861635220127,
 'Personalization': 810.2267441860465}

### Parse Patent Files
Parse patent files for their abstract if they're labeled with one of the remaining classes.

In [4]:
# parse xml files to get features
from PipelineBricks.parse_feature import process_files
dataset = pd.DataFrame
if __name__ == "__main__":
    feature_list = ['abstract']
    dataset = process_files(feature_stats, feature_list)

100%|██████████| 139359/139359 [13:22<00:00, 173.71it/s]


It can happen that abstract tags in the cml file of the patent exist but the text value is an empty
string. We need to filter these instances.

In [8]:
# Add labels to dataset
dataset["label"] = feature_stats["level1labels"]
# Check if there are empty cells in abstract column (abstract tag exists in document but no content)
print(dataset[dataset["abstract"].isna() == True])
# drop nan rows once again
dataset = dataset[dataset["abstract"].notna()]

Empty DataFrame
Columns: [abstract, label_encoded, label]
Index: []


In [9]:
from pprint import pprint
# convert labels to categorical and create integer codes
dataset["label"] = pd.Categorical(dataset["label"])
dataset["label_encoded"] = dataset["label"].cat.codes
# Assigned categories
pprint(dict(enumerate(dataset["label"].cat.categories)))

{0: 'Cleansing',
 1: 'Decorative cosmetic',
 2: 'Deo',
 3: 'Devices',
 4: 'Hair care',
 5: 'Health care',
 6: 'Lip care',
 7: 'Manufacturing technology',
 8: 'Non woven',
 9: 'Packaging',
 10: 'Perfume',
 11: 'Personalization',
 12: 'Shaving',
 13: 'Skin care',
 14: 'Sun',
 15: 'Sustainability'}


In [2]:
dataset = pd.read_csv("dataset.csv", index_col=0)

### Correct the class imbalance
Approach 1: Over-sampling (create duplicates) of less frequent classes

In [5]:
# split into train and test part first to avoid that the test set has duplicates of
# of data in train set
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset[["abstract", "label_encoded"]], test_size=0.25,
                               random_state=1, stratify=dataset["label_encoded"])

print(f'instances in train set: \n{train["label_encoded"].value_counts()}')
print(f'instances in test set: \n{test["label_encoded"].value_counts()}')

instances in train set: 
13    22570
9     18797
5     16417
4     15878
0      8118
14     6069
10     4492
2      3182
8      2389
3      1424
6      1396
1      1370
7      1303
12      689
15      358
11       65
Name: label_encoded, dtype: int64
instances in test set: 
13    7524
9     6266
5     5473
4     5293
0     2706
14    2023
10    1497
2     1061
8      796
3      475
6      465
1      457
7      434
12     230
15     119
11      21
Name: label_encoded, dtype: int64


We only care about the train dataset for the resampling.

In [36]:
oversampling_ids = [11, 15, 12]
sampling_factor = 4

# oversampling
min_classes = train[train["label_encoded"].isin(oversampling_ids)]
oversampled = min_classes.sample(n=len(min_classes)*sampling_factor, replace=True, random_state=1)
print(oversampled["label_encoded"].value_counts())

# # undersampling
# maj_classes = train[train["label_encoded"].isin(undersampling_ids)]
# undersampled = maj_classes.sample(n=math.ceil(len(maj_classes)/sampling_factor), replace=True, random_state=1)
# print(undersampled["label_encoded"].value_counts())

12    1880
15     923
11     165
Name: label_encoded, dtype: int64
13    3854
9     3076
4     2685
5     2662
Name: label_encoded, dtype: int64


In [37]:
# concatenate dataframes
train = pd.concat([train, oversampled], axis=0)

### Export

In [6]:
train.to_csv("train.csv")
test.to_csv("test.csv")

