### Experiment 1: Single class classification based on abstract texts

In [2]:
import pandas as pd

labels = pd.read_csv("level1_labels.csv", index_col=0)
features = pd.read_csv("../../Utilities/feature_stats.csv", index_col=0)
feature_stats = pd.concat([features, labels], axis=1)
feature_stats.head()

Unnamed: 0,path,abstract,description,claims,label,level1labels
AP1605A,E:\MLData\thesis\Datasets\LexisNexis\AP1605A.xml,False,False,False,0,
AP1665A,E:\MLData\thesis\Datasets\LexisNexis\AP1665A.xml,True,False,False,1,Skin care
AP1682A,E:\MLData\thesis\Datasets\LexisNexis\AP1682A.xml,True,False,False,1,Active ingredients
AP1904A,E:\MLData\thesis\Datasets\LexisNexis\AP1904A.xml,True,False,False,1,Hair care
AP1937A,E:\MLData\thesis\Datasets\LexisNexis\AP1937A.xml,True,False,False,1,Packaging


In [3]:
feature_stats = feature_stats[feature_stats["level1labels"].notna()] # drop unlabeled patents
feature_stats = feature_stats[feature_stats["abstract"] == 1] # drop patents that don't contain an abstract
print(f"Number of examples: {labels.size}")
print(feature_stats["level1labels"].value_counts())

Number of examples: 220709
Active ingredients              57205
Skin care                       30095
Packaging                       25063
Health care                     21890
Hair care                       21171
Cleansing                       10824
Sun                              8092
Perfume                          5989
Deo                              4243
Non woven                        3185
Devices                          1899
Lip care                         1861
Decorative cosmetic              1828
Manufacturing technology         1737
Shaving                           919
Sustainability                    477
Personalization                    86
Artificial Intelligence (AI)        5
no follow up                        3
IP7 Beiersdorf                      2
Name: level1labels, dtype: int64


In [4]:
# define a set of classes that are excluded (AI has not enough instances, the others make no sense for classification here)
exclude_list = ["Artificial Intelligence (AI)", "no follow up", "IP7 Beiersdorf", "Active ingredients"]
mask = feature_stats['level1labels'].isin(exclude_list)
feature_stats =feature_stats[~mask]
feature_stats["level1labels"].value_counts()

Skin care                   30095
Packaging                   25063
Health care                 21890
Hair care                   21171
Cleansing                   10824
Sun                          8092
Perfume                      5989
Deo                          4243
Non woven                    3185
Devices                      1899
Lip care                     1861
Decorative cosmetic          1828
Manufacturing technology     1737
Shaving                       919
Sustainability                477
Personalization                86
Name: level1labels, dtype: int64

In [13]:
# create dataset

# parse xml files to get features
from PipelineBricks.parse_feature import process_files

if __name__ == "__main__":
    feature_list = ['abstract']
    dataset = process_files(feature_stats, feature_list)


In [5]:
dataset = pd.read_csv("dataset2.csv", index_col=0)
# Add labels to dataset
dataset["label"] = feature_stats["level1labels"]
# Check if there are empty cells in abstract colmn (abstract tag exists in document but no content)
print(dataset[dataset["abstract"].isna() == True])
# drop nan rows once again
dataset = dataset[dataset["abstract"].notna()]

             abstract                label
MY168204A         NaN            Skin care
TW200936176A      NaN  Decorative cosmetic


In [6]:
# convert labels to categorical and create integer codes
dataset["label"] = pd.Categorical(dataset["label"])
dataset["label"] = dataset["label"].cat.codes
dataset

Unnamed: 0,abstract,label
AP1665A,Disclosed is an oral dosage form comprising (i...,13
AP1904A,Light-converting material comprises a europium...,4
AP1937A,A flexible container (1) for holding a liquid ...,9
AP2015008920A0,The invention relates to transdermal therapeut...,5
AP2016009265A0,An antimicrobial composition is disclosed that...,0
...,...,...
YU75202A,"Slabo otiruća, stabilna antiperspirant i/ili d...",8
YU82803A,"Dvofazni, sa kuglicom za nanošenje antiperspir...",9
YU86802A,Emulzije sa malo vode su opisane koje su koris...,10
YU86902A,This invention relates to an anhydrous cosmeti...,2


In [7]:
from langdetect import detect
from tqdm import tqdm
# some texts still seem to be not english
# use language detection to filter them

for index, abstract in zip(dataset.index, tqdm(dataset["abstract"])):
    if not detect(abstract) == 'en':
        dataset.loc[index, "abstract"] = None

dataset

  5%|▍         | 6538/139357 [00:40<13:41, 161.66it/s]


KeyboardInterrupt: 