### Experiment 1: Single class classification based on abstract texts

In [1]:
import pandas as pd

labels = pd.read_csv("level1_labels.csv", index_col=0)
features = pd.read_csv("../../Utilities/feature_stats.csv", index_col=0)
feature_stats = pd.concat([features, labels], axis=1)
feature_stats.head()

Unnamed: 0,path,abstract,description,claims,label,level1labels
AP1605A,E:\MLData\thesis\Datasets\LexisNexis\AP1605A.xml,False,False,False,0,
AP1665A,E:\MLData\thesis\Datasets\LexisNexis\AP1665A.xml,True,False,False,1,Skin care
AP1682A,E:\MLData\thesis\Datasets\LexisNexis\AP1682A.xml,True,False,False,1,Active ingredients
AP1904A,E:\MLData\thesis\Datasets\LexisNexis\AP1904A.xml,True,False,False,1,Hair care
AP1937A,E:\MLData\thesis\Datasets\LexisNexis\AP1937A.xml,True,False,False,1,Packaging


In [2]:
feature_stats = feature_stats[feature_stats["level1labels"].notna()] # drop unlabeled patents
feature_stats = feature_stats[feature_stats["abstract"] == 1] # drop patents that don't contain an abstract
print(f"Number of examples: {labels.size}")
print(feature_stats["level1labels"].value_counts())

Number of examples: 220709
Active ingredients              57205
Skin care                       30095
Packaging                       25063
Health care                     21890
Hair care                       21171
Cleansing                       10824
Sun                              8092
Perfume                          5989
Deo                              4243
Non woven                        3185
Devices                          1899
Lip care                         1861
Decorative cosmetic              1828
Manufacturing technology         1737
Shaving                           919
Sustainability                    477
Personalization                    86
Artificial Intelligence (AI)        5
no follow up                        3
IP7 Beiersdorf                      2
Name: level1labels, dtype: int64


In [3]:
# define a set of classes that are excluded (AI has not enough instances, the others make no sense for classification here)
exclude_list = ["Artificial Intelligence (AI)", "no follow up", "IP7 Beiersdorf", "Active ingredients"]
mask = feature_stats['level1labels'].isin(exclude_list)
feature_stats =feature_stats[~mask]
feature_stats["level1labels"].value_counts()

Skin care                   30095
Packaging                   25063
Health care                 21890
Hair care                   21171
Cleansing                   10824
Sun                          8092
Perfume                      5989
Deo                          4243
Non woven                    3185
Devices                      1899
Lip care                     1861
Decorative cosmetic          1828
Manufacturing technology     1737
Shaving                       919
Sustainability                477
Personalization                86
Name: level1labels, dtype: int64

In [None]:
# create dataset

# parse xml files to get features
from PipelineBricks.parse_feature import process_files
dataset = pd.DataFrame
if __name__ == "__main__":
    feature_list = ['abstract']
    dataset = process_files(feature_stats, feature_list)


  0%|          | 87/139359 [00:00<11:38, 199.26it/s] 


In [13]:
# Add labels to dataset
dataset["label"] = feature_stats["level1labels"]
# Check if there are empty cells in abstract colmn (abstract tag exists in document but no content)
print(dataset[dataset["abstract"].isna() == True])
# drop nan rows once again
dataset = dataset[dataset["abstract"].notna()]

Empty DataFrame
Columns: [abstract, label, label_encoded]
Index: []


In [15]:
# convert labels to categorical and create integer codes
dataset["label"] = pd.Categorical(dataset["label"])
dataset["label_encoded"] = dataset["label"].cat.codes
# Assigned categories
print(dict(enumerate(dataset["label"].cat.categories)))
dataset[["abstract", "label_encoded"]].to_csv("dataset2.csv")


{0: 'Cleansing', 1: 'Decorative cosmetic', 2: 'Deo', 3: 'Devices', 4: 'Hair care', 5: 'Health care', 6: 'Lip care', 7: 'Manufacturing technology', 8: 'Non woven', 9: 'Packaging', 10: 'Perfume', 11: 'Personalization', 12: 'Shaving', 13: 'Skin care', 14: 'Sun', 15: 'Sustainability'}
