[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/cdrc1103/MasterThesis/blob/master/Experiments/MultiLabel/SynonymsDataSet.ipynb)

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
import json
from pathlib import Path
import pandas as pd

# PARAMETERS FOR EXPERIMENT CONFIGURATION
BASE_DIR = Path(f"gdrive/MyDrive/Colab Notebooks/Thesis")

class Parameter():
  def __init__(self):
    self.__dict__ = {
      "project_name": "thesis_multi-label",
      "train_dataset": "train_A0.csv",
      "test_dataset": "test_A0.csv",
      "embedding_dataset": "patent-300.vec", # 
      "max_token_length": 100, # number of words/subwords, an instance is truncated to
      "max_tokens": 39000, # vocabulary size
      "file_size": 7084721, # or 999995 number of embedding vectors in file
      "embedding_dim": 300, # dimension of the pre-trained embeddings
      "batch_size": 128,
      "learning_rate": 1e-3,
      "epochs": 2,
      "seed": 1, # random seed for reproducability
      "logging": False, # whether to log to the wandb database (True) or to disk (False)
      "n_classes": 18, # how many unique labels there are in the data set
      "in_feature": "abstract", # feature used for training
      "out_feature": "label", # feature to predict
      "output_size": 128, # output size of the language processing layer, i.e. the CNN, GRU etc. layer
      "dropout_rate": 0.1, # for regularization
      "kernel_size": 5, # size of the CNN kernel
      "threshold": 0.5
    }

PARAMS = Parameter()

# Read data sets
train_df = pd.read_csv(BASE_DIR / PARAMS.train_dataset, index_col=0)
train_df = train_df.sample(frac=1, random_state=PARAMS.seed)

# Filter relevant classes

x_train = train_df[PARAMS.in_feature]
y_train = train_df[train_df.columns.difference([PARAMS.in_feature], sort=False)].to_numpy()

# Print stats
print(f"Train data set:{len(train_df)} instances")

Train data set:102133 instances


In [None]:
""" Dependencies """
import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences

""" Create TF datasets """
# Transform to tensorflow data set 
train_ds = tf.data.Dataset.from_tensor_slices((x_train, train_df.index))\
                                              .batch(PARAMS.batch_size)\
                                              .prefetch(tf.data.AUTOTUNE)

""" Build embeddings """
# Parse file
def get_coefs(word, *arr): 
  return word, np.asarray(arr, dtype='float32')
#embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in tqdm(open(BASE_DIR/PARAMS.embedding_dataset), total=PARAMS.file_size))
print("Found %s word vectors in the embedding file." % len(embeddings_index))

# Configure the tokenizer settings
vectorizer = TextVectorization(max_tokens=PARAMS.max_tokens, output_sequence_length=PARAMS.max_token_length)
vectorizer.adapt(train_ds.map(lambda text, index, *args: text))

# Tokenize data set
def tokenize_text(text, index, *args):
  text = tf.expand_dims(text, -1)
  return (vectorizer(text), index)
train_ds = train_ds.map(tokenize_text)

# Build lookup table of word/index mappings
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

# Prepare embedding matrix
num_tokens = len(voc)
hits = 0
misses = 0
embedding_matrix = np.zeros((num_tokens, PARAMS.embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

Found 7084721 word vectors in the embedding file.


In [None]:
# Create token index
token_index = {v: k for k,v in word_index.items()}

In [None]:
# convert tf dataset to numpy
tokens = np.zeros([len(train_df),PARAMS.max_token_length], dtype=int)
index = []
counter = 0
for element in train_ds.as_numpy_iterator():
  tokens[counter: counter+element[0].shape[0], :] = element[0]
  counter+=PARAMS.batch_size

In [None]:
# generate stopword list
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sp = stopwords.words('english')
sp_vec = vectorizer(sp).numpy()[:,0]
sp_vec = np.array([i for i in sp_vec if i != 1])
sp_vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array([   91,  4194,  6477,  8613, 11440, 16133, 18130,  8016, 21617,
        7534, 33036, 18255, 10916, 38210,   141,   193,   193,  2052,
        1061,   839,   340,  5169,  5605,    23,  3708, 20980,   142,
          26,   347,  1349,  2403,     7,    22,  1580,  3519,    19,
         630,    89,   155,    31,  6779,    33,  1391,   433, 26963,
       12791,     3,     9,     2,     5,   531,   852,    10,  1359,
          21,  1177,   332,     4,    16,    12,    11,    17,    96,
         483,   118,    87,   133,   372,   912,   158,   444,   800,
           6,    18,   455,  1282,     8,   293,    36,   979,   502,
         403,  3101,   143,   338,  2066,  4242,   610,   103,   236,
       31002,  8363,   529,   442,   435,   132,  5275,    53,  1299,
         123,   792,    51,   201,  4152,   119,   612,  4882,   188,
         115,    93,  5361,   907,   772,  1118,    29,  1639,  4533,
       33540, 29786,  4422,  8428,   215,  5233,   521,   784,  8252,
        7728,   795,

In [None]:
# get nearest neighbor and create synonym dict
from sklearn.neighbors import NearestNeighbors

synonyms_number = 5
word_number = embedding_matrix.shape[0]

nn = NearestNeighbors(n_neighbors=synonyms_number+1, n_jobs=4).fit(embedding_matrix)
neighbours_mat = nn.kneighbors(embedding_matrix[1:word_number])[1]


In [None]:
# save synonym dict
import pickle
synonyms = {x[0]: x[0:] for x in neighbours_mat}
with open(BASE_DIR/'synonyms.pickle', 'wb') as handle:
    pickle.dump(synonyms, handle)

In [None]:
#print(token_index.get(204))
[token_index[t] for t in synonyms.get(206)]

['sheet', 'sheets', 'paper', 'sheetlike', 'sheetform', 'strip']

In [None]:
def get_synonyms(n, ids, tokens):
  syn_sentences = []
  syn_ids = []
  r1 = np.random.randint(0, synonyms_number+1, size=[n, tokens.shape[1]]) # random synonym
  r2 = np.random.randint(0,tokens.shape[0], size=n) # random sequence
  for i in range(n):
    sequence = tokens[r2[i],:]
    for j in range(PARAMS.max_token_length):
      word = sequence[j]
      if not word in sp_vec:
        syns = synonyms.get(word)
        if syns is not None:
          sequence[j] = syns[r1[i,j]]
    syn_sentences.append(" ".join([token_index.get(t) for t in sequence]).rstrip())
    syn_ids.append(ids[r2[i]])
  return syn_sentences, syn_ids

In [None]:
THRESHOLD = 3000
FEATURE = "abstract"
x_train = x_train.rename(FEATURE, inplace=True)
subset_list = []
for i in tqdm(np.arange(y_train.shape[1])):
  ids = np.where(y_train[:,i]==1)[0]
  token_subset = tokens[ids]
  x_subset = x_train.iloc[ids].to_frame()
  patent_ids = x_train.index
  y_subset = pd.DataFrame(y_train[ids], index=patent_ids[ids])
  if len(ids) < threshold:
    n_samples = threshold-len(ids)
    syn_sentences, syn_ids = get_synonyms(n_samples, ids, token_subset)
    y_train_syn = pd.DataFrame(y_train[syn_ids], index=patent_ids[syn_ids])
    x_train_syn = pd.DataFrame(syn_sentences, index=patent_ids[syn_ids], columns=[FEATURE])
    x_subset = pd.concat([x_subset, x_train_syn], axis=0)
    y_subset = pd.concat([y_subset, y_train_syn], axis=0)
  subset = pd.concat([x_subset, y_subset], axis=1)
  subset_list.append(subset)
train_syn = pd.concat(subset_list, axis=0)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [None]:
train_syn.abstract

patentid
EP2216263A1         The package has a bottom wall for supporting a...
AU2017203644B2      H1:\aar r..rovenNRPorthlMCC\AARU?:465115S2_1.d...
WO2007015964A1      A laminated negative pressure wound dressing s...
HK1256111A1         The present invention relates to exosomes with...
CN109288738A        The invention relates to the technical field o...
                                          ...                        
EP3709637A1         in an some a that uppermost keratinisation whi...
AU2006279800B2      71 some a further and a that representing swin...
WO2020194365A1      this the for a a nutraceutically of customizab...
US8271992B2         a because for insufficient deformable and a ea...
BR112016026660A2    an hence the for the because of dermopharmaceu...
Name: abstract, Length: 148202, dtype: object

In [None]:
x_train.loc['EP3709637A1']

"In an embodiment, a virtual hair coloration system includes: a projector 22 configured to project digital content including a makeup application tutorial onto the user's hair; and a dynamic mapping unit 24; 30 operably coupled to the projector, wherein the dynamic mapping unit is configured to establish a dynamic correspondence between pixels of the projector 22 and features of the user's hair."

In [None]:
train_syn.to_csv(BASE_DIR / "train_A3000SYN.csv")

557

In [None]:
[token_index[t] for t in synonyms.get(word_index["virtual"])]

['virtual', 'reality', 'real', 'configuring', 'shared', 'accessing']

In [3]:
%%capture
pip install translators --upgrade

In [14]:
import pandas as pd
import requests

url = 'https://free-proxy-list.net/'

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

proxy_df = pd.read_html(r.text)[0].dropna()
proxy_df

Unnamed: 0,IP Address,Port,Code,Country,Anonymity,Google,Https,Last Checked
0,129.146.249.135,80.0,US,United States,elite proxy,no,yes,1 minute ago
1,192.200.220.56,3128.0,US,United States,anonymous,no,yes,1 minute ago
2,161.202.226.194,80.0,JP,Japan,elite proxy,no,yes,1 minute ago
3,5.252.161.48,8080.0,GB,United Kingdom,anonymous,no,no,1 minute ago
4,175.143.37.162,80.0,MY,Malaysia,anonymous,no,no,1 minute ago
...,...,...,...,...,...,...,...,...
295,49.248.152.240,80.0,IN,India,anonymous,no,no,52 minutes ago
296,191.100.20.14,8080.0,EC,Ecuador,anonymous,no,yes,53 minutes ago
297,176.110.121.90,21776.0,RU,Russian Federation,anonymous,no,yes,53 minutes ago
298,119.82.252.122,36182.0,KH,Cambodia,elite proxy,no,no,53 minutes ago


In [21]:
proxy_df["IP Address"].iloc[r]

'45.234.5.82'

In [30]:
import numpy as np
subset_list = []
for i in np.arange(y_train.shape[1]):
  ids = np.where(y_train[:,i]==1)[0]
  if len(ids) < 3000:
    subset_list.append(x_train.iloc[ids])
abstracts = pd.concat(subset_list)

In [32]:
len(abstracts)

11788

In [34]:
r

296

In [None]:
abstracts = abstracts.apply(lambda x: " ".join(x.split()[0:100]))

In [37]:
import translators as ts
import random
from tqdm.notebook import tqdm
import asyncio

r = random.randint(0,len(df))
l_codes =  ["fr", "it", "el", "es", "bg", "cs", "da", "zh-CN"]
translation_dict = {}
abstracts = abstracts.apply(lambda x: " ".join(x.split()[0:100]))
for a, id in zip(abstracts, tqdm(abstracts.index)):
  re_translation = []
  for l in l_codes:
    r = random.randint(0,len(df)-1)
    random_proxy=proxy_df["IP Address"].iloc[r]
    translation = ts.google(a, from_language="en", to_language=l, proxies={"http":f"http://{random_proxy}"}, sleep_seconds=0)
    re_translation.append(ts.google(translation, to_language="en"))
  translation_dict[id] = re_translation

async def main():
    print(await translator.translate(text, target_lang=deepl.TargetLang.Japanese))

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

HBox(children=(FloatProgress(value=0.0, max=11788.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:

import asyncio

import deepl

text = 'I have a pen.'

translator = deepl.Translator(deepl.AiohttpAdapter('Your API key'))


async def main():
    print(await translator.translate(text, target_lang=deepl.TargetLang.Japanese))

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

In [35]:
help(ts.google)

Help on method google_api in module translators.apis:

google_api(query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs) -> Union[str, list] method of translators.apis.GoogleV2 instance
    https://translate.google.com, https://translate.google.cn.
    :param query_text: str, must.
    :param from_language: str, default 'auto'.
    :param to_language: str, default 'en'.
    :param **kwargs:
            :param if_use_cn_host: boolean, default None.
            :param if_ignore_limit_of_length: boolean, default False.
            :param is_detail_result: boolean, default False.
            :param proxies: dict, default None.
            :param sleep_seconds: float, default `random.random()`.
    :return: str or list



In [26]:
translation_dict

{'CN107714559A': ['The invention proposes a skin care product without deeply moisturizing antispical. The skin care product comprises a humectant conditioning agent and the skin. Humacy includes isomeric saccharides and sodium hyaluronate.Skin packaging comprises hydrolyzed silk proteins.The saccharide isomer, sodium hyaluronate and hydrolysed silk proteins are used for internal hydration and externalskins. The skin care product can penetrate the lipid layer of hair follicles.The skin to provide protein and water from the skin through the conditioning of hydrolyzed silk proteins and sodium hyaluronate, the elasticity of the skin is improved and the skin becomes soft and smooth.The saccharides isomer.can lock the',
  'The invention provides a product for deeply moisturizing antiseptic skin. The product for skin care includes a humectant and a skin conditioning agent. The umination includes isomers of saccharides and sodium ialoronta.The conditioning agent of theLeather includes hydrolys

In [None]:
pd.set_option('display.max_colwidth', 500)

In [None]:
print(a)

The present invention relates to a liquid lipstick container having an applicator, the surface of which has been subjected to flocking so as to improve the spreadability of a liquid lipstick, and having a symmetric spreading hole formed thereon such that the liquid lipstick can be applied evenly.


In [None]:
pd.DataFrame(re_translation)

Unnamed: 0,0
0,The rumor of liquid lepics is composed of the lypsy converter that the driver of driving deficiency in such a consistent consequences of the distribution of the distribution of the distribution of the distribution of the distribution of the distribution of agreement.
1,"The current invention is a liquid lipstick container having a rod, and the surface that brings him flows to improve the spread of lipstick lipstick."
2,"This inventory contains liquid lips that are limited liquid liquid liquid liquid, then grumps to spread the liquid liquid and make a liquid liper to make a chip lipstry."
3,The present invention relates to a liquid lipstick container with an applicator whose surface has been subjected to neat to improve the spread of liquid lipstick and formed a symmetrical spreading hole so that liquid lipstick can be applied evenly.
4,"The current invention refers to the container of liquid lipstick, the surface in which the surface was transferred to improve the lipstick of liquid lipstick, and it has a symmetrical spreading hole created equally."
5,"The present invention relates to a liquid lipstick container having an applicator, wherein the surface has a symmetrical spreading hole having a symmetric spreading hole having a symmetrical spread hole for improving the spread of liquid lipstick and can be applied evenly to a liquid lipstickOn ...."
6,"A labeling of the label forms a fluent container, which is an applicator having a surfacing expansion in order to improve the transferring fluorescence and, after being aimed at one-simmetric agents, whether the liquid rust can be applied equally."
7,"The present invention relates to a liquid lipstick having an applicator, which has been subjected to flocking to improve the spreadability of liquid lipstick and form a symmetrical diffusion hole thereon such that liquid lip balm can be uniformly applied."
