In [1]:
%pip install transformers datasets evaluate accelerate tensorflow

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Data processing
import pandas as pd

# Modeling
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Model performance evaluation
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


**Load Data**

In [3]:
# Locations of the two competition files, relative to the working directory.
train_path = "train_for_student.json"
test_path = "test_for_student.json"

# Parse both JSON files into DataFrames.
train_df, kaggle_test_df = (
    pd.read_json(json_path) for json_path in (train_path, test_path)
)

# Display the training frame exactly as loaded.
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,445,446,447,448,449,450,451,452,453,454
Title,Activated carbon derived from bacterial cellul...,The algorithm of static hand gesture recogniti...,Alternative Redundant Residue Number System Co...,Comparative study of wax inhibitor performance...,Undrained lower bound solutions for end bearin...,Words Diffusion an Analysis of across Facebook...,Transformation of time Petri net into Promela,Annual Degradation Rate Analysis of Mono-Si Ph...,Development of Low-Cost in-the-Ear EEG Prototype,Model-based analysis of an integrated zinc-air...,...,Effect of rhenium and cobalt additions on the ...,Wheeling charge calculation with consideration...,Scaling laws for static displacement of linear...,Effect of TMB/P123 ratios on physicochemical p...,Synthetic CaO-based sorbent for high-temperatu...,A portable USB-controlled potentiostat for pap...,Literature reviews on applying artificial inte...,A multi-parameterized water quality prediction...,Semantic Segmentation on Medium-Resolution Sat...,Reducing the defects of a-pillar stamping part...
Abstract,© 2019 Elsevier B.V.Activated carbon derived f...,© Springer International Publishing AG 2018.Te...,© 2018 IEEE.Residue number system (RNS) is a n...,© Published under licence by IOP Publishing Lt...,"© 2019 John Wiley & Sons, Ltd.The undrained be...",© 2018 IEEE.Facebook Pages in Thailand have be...,© 2017 IEEE.This paper proposes a method of tr...,© 2013 IEEE.The annual degradation rate (DR) o...,© 2018 IEEE.This study focused on building a l...,"© 2019 Lao-atiman, Bumroongsil, Arpornwichanop...",...,© Carl Hanser Verlag GmbH & Co. KG.The effect ...,"© 2019 IEEE.In Thailand, Small Power Producers...",© 2018 Elsevier LtdExperimental studies on the...,© 2021 Elsevier LtdThe preparation of mesocell...,© 2018 Hydrogen Energy Publications LLCCalcium...,© 2018 IEEEThis paper presents a portable and ...,Copyright © 2019 for this paper by its authors...,© 2019 The authors and IOS Press. All rights r...,© 2018 IEEE.Semantic Segmentation is a fundame...,© 2019 IEEE.This research aims to reduce defec...
Classes,"[CHE, MATENG]",[CPE],[EE],"[PE, ME, CHE]","[CE, MATSCI]",[CPE],"[CPE, MATH]","[PE, EE, CHE]","[BME, CPE, IE]","[PE, METAL, EE, CPE, CHE, IE, MATH]",...,"[METAL, ME, CHE, MATH, MATSCI]","[ME, EE]","[CE, MATSCI]","[OPTIC, CHE, MATSCI]","[PE, CHE]","[CPE, CHE]","[CPE, EDU]","[ENV, EE, CHE]","[EE, CPE, OPTIC, EDU]","[METAL, EDU, MATSCI]"


Data Preparation

In [4]:
# Swap rows and columns so each paper becomes a row with Title/Abstract/Classes columns.
# NOTE: train_df is overwritten with its own transpose, so running this cell a
# second time flips it back; re-run the loading cell first if that happens.
train_df = train_df.transpose()
kaggle_test_data = kaggle_test_df.transpose()
train_df

Unnamed: 0,Title,Abstract,Classes
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,"[CHE, MATENG]"
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,[CPE]
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,[EE]
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,"[PE, ME, CHE]"
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...","[CE, MATSCI]"
...,...,...,...
450,A portable USB-controlled potentiostat for pap...,© 2018 IEEEThis paper presents a portable and ...,"[CPE, CHE]"
451,Literature reviews on applying artificial inte...,Copyright © 2019 for this paper by its authors...,"[CPE, EDU]"
452,A multi-parameterized water quality prediction...,© 2019 The authors and IOS Press. All rights r...,"[ENV, EE, CHE]"
453,Semantic Segmentation on Medium-Resolution Sat...,© 2018 IEEE.Semantic Segmentation is a fundame...,"[EE, CPE, OPTIC, EDU]"


In [5]:
# Display the transposed test frame (one row per paper).
kaggle_test_data

Unnamed: 0,Title,Abstract
001eval,Comparative Electrical Energy Yield Performanc...,© 2013 IEEE.Long-term energy evaluation of PV ...
002eval,Effects of graphene nanoplatelets on bio-based...,© The Author(s) 2021.Novel near-infrared (NIR)...
003eval,Anti-inflammatory action of two novel peptides...,© The Royal Society of Chemistry 2020.Peanut w...
004eval,Efficient all-and-one support vector machines ...,© 2018 IEEE.We introduce a new strategy to est...
005eval,Driver identification using histogram and neur...,© 2017 IEEE.Sensor technology has continuously...
...,...,...
147eval,Utilization of Sewage Sludge from Beverage Ind...,© Published under licence by IOP Publishing Lt...
148eval,Development of a Gateway for OpenADR-ECHONET L...,"© 2018 IEEE.In this paper, we develop an ECHON..."
149eval,Effect of solution treatment and precipitation...,© 2017 Elsevier Ltd. All rights reserved.The a...
150eval,An effect-analysis method for species-dependen...,"© The Authors, published by EDP Sciences, 2019..."


In [6]:
# Per-column summary (count / unique / top / freq) of the training frame.
train_df.describe()

Unnamed: 0,Title,Abstract,Classes
count,454,454,454
unique,453,454,284
top,Structure and mechanical properties of ADC 12 ...,© 2019 Elsevier B.V.Activated carbon derived f...,[CPE]
freq,2,1,29


In [7]:
# Count missing values in every training column.
train_df.isna().sum()

Title       0
Abstract    0
Classes     0
dtype: int64

In [8]:
# Count missing values in every test column.
kaggle_test_data.isna().sum()

Title       0
Abstract    0
dtype: int64

In [9]:
# How often each exact combination of classes occurs in the training set.
train_df['Classes'].value_counts()

Classes
[CPE]                              29
[EE]                               14
[CPE, MATH]                        14
[METAL, CHE]                        9
[CHE, MATENG]                       8
                                   ..
[ENV, CPE, SAFETY, MATH]            1
[ENV, METAL, ME, EE, NANO, CHE]     1
[CE, CPE, MATENG, MATH, MATSCI]     1
[PE, ME, EE, CPE, MATH]             1
[METAL, EDU, MATSCI]                1
Name: count, Length: 284, dtype: int64

In [10]:
# NOTE(review): this cell repeats the value_counts() call of In [9] and shows the
# same table; it can be deleted without affecting anything downstream.
train_df.Classes.value_counts()

Classes
[CPE]                              29
[EE]                               14
[CPE, MATH]                        14
[METAL, CHE]                        9
[CHE, MATENG]                       8
                                   ..
[ENV, CPE, SAFETY, MATH]            1
[ENV, METAL, ME, EE, NANO, CHE]     1
[CE, CPE, MATENG, MATH, MATSCI]     1
[PE, ME, EE, CPE, MATH]             1
[METAL, EDU, MATSCI]                1
Name: count, Length: 284, dtype: int64

In [11]:
# Give both frames a single 'text' column (title, one space, abstract) and
# remove the two source columns in place.
for frame in (train_df, kaggle_test_data):
    frame['text'] = frame['Title'] + " " + frame['Abstract']
    frame.drop(columns=['Title', 'Abstract'], inplace=True)

In [12]:
# Peek at the combined title + abstract of the first paper (row 0, column 1 = 'text').
# A single two-axis positional lookup is used because chaining iloc[0][1] indexes a
# label-indexed Series with an integer, which pandas has deprecated.
train_df.iloc[0, 1]

  train_df.iloc[0][1]


'Activated carbon derived from bacterial cellulose and its use as catalyst support for ethanol conversion to ethylene © 2019 Elsevier B.V.Activated carbon derived from bacterial cellulose (BC-AC) was modified with various amounts of H3PO4(x wt% P/BC-AC) and used as a catalyst for the selective dehydration of ethanol to ethylene. The BC-AC obtained at a carbonization temperature of 500 °C had a mesoporous structure with surface area and total pore volume of ~1730 m2/g and 1.0 cm3/g, respectively. An increase in the H3PO4 loading from 5% to 40% increased the number of weak acid sites on the catalyst surface, which consequently enhanced ethanol conversion. At the reaction temperature of 400 °C, the modified BC-AC with 30-40 wt% H3PO4 loading (P/BC-AC) gave an ethanol conversion at 100% and an ethylene selectivity of 100%. A high selectivity for diethyl ether (DEE) at ~ 67% at ethanol conversion of ~ 50% was obtained at 200 °C. Stability tests with a time-on-stream of 12 h, at reaction tem

In [13]:
# The Hugging Face Trainer conventionally reads targets from a field called 'labels'.
label_column_rename = {'Classes': 'labels'}

train_df = train_df.rename(columns=label_column_rename)
kaggle_test_data = kaggle_test_data.rename(columns=label_column_rename)

Label Encoding (multi-hot vectors via MultiLabelBinarizer)

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

In [15]:
# Learn the set of classes and turn each paper's class list into a multi-hot row.
# The float32 cast matches the floating-point targets the multi-label loss expects.
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(train_df['labels']).astype('float32')

# Plain Python list of documents, in the same row order as `labels`.
texts = train_df['text'].to_list()

In [16]:
# Class names in the order used by the binarizer; position i of a label vector
# corresponds to classes_[i].
print(multilabel.classes_)
# Multi-hot label vector of the first training row.
labels[0]

['AGRI' 'BME' 'CE' 'CHE' 'CPE' 'EDU' 'EE' 'ENV' 'IE' 'MATENG' 'MATH'
 'MATSCI' 'ME' 'METAL' 'NANO' 'OPTIC' 'PE' 'SAFETY']


array([0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0.], dtype=float32)

In [17]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from datasets import Dataset as HGDataset

In [18]:
# Hold out 5% of the papers for validation; the fixed random_state keeps the split repeatable.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(texts, labels, test_size=0.05, random_state=42)

Export Train and Validation Data

In [19]:
# Print the multi-hot label vector of the first row of the full label matrix
# (before the train/validation split).
print(labels[0])

[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [20]:
columns = multilabel.classes_

# Lay each split out as a 'text' column followed by one 0/1 column per class.
train_df = pd.concat(
    [pd.DataFrame({'text': train_texts}), pd.DataFrame(train_labels, columns=columns)],
    axis=1,
)
val_df = pd.concat(
    [pd.DataFrame({'text': val_texts}), pd.DataFrame(val_labels, columns=columns)],
    axis=1,
)

# Write both splits to CSV without the positional row index.
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('validation_data.csv', index=False)

In [None]:
# index=False leaves the row index out of the file, so only the remaining columns are written.
# NOTE(review): the paper ids are stored in that index, so they are not exported — confirm this is intended.
kaggle_test_data.to_csv(path_or_buf='test_data.csv', index=False)

In [21]:
# Pretrained backbone; the classification head placed on top of it is newly initialised.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# One output unit per class learned by the binarizer, so the head width always
# matches the multi-hot label vectors instead of relying on a hard-coded count.
# problem_type="multi_label_classification" makes the model score every label
# independently rather than picking a single class.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(multilabel.classes_),
    problem_type="multi_label_classification",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
class CustomDataset(Dataset):
  """Map-style dataset that pairs raw texts with their label vectors.

  Items are tokenized lazily, on every access, and padded/truncated to
  ``max_len`` tokens.
  """

  def __init__(self, texts, labels, tokenizer, max_len=512):
    """Store the inputs; nothing is tokenized here.

    Args:
      texts: indexable collection of documents.
      labels: indexable collection of label vectors aligned with ``texts``.
      tokenizer: callable taking a string plus ``truncation``, ``padding``,
        ``max_length`` and ``return_tensors`` keyword arguments and returning
        a mapping with ``input_ids`` and ``attention_mask`` tensors.
      max_len: sequence length passed to the tokenizer as ``max_length``.
    """
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.labels = labels
    self.texts = texts

  def __len__(self):
    """Number of examples, taken from ``texts``."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
    document = str(self.texts[idx])
    target = torch.tensor(self.labels[idx])

    encoded = self.tokenizer(
        document,
        truncation=True,
        padding="max_length",
        max_length=self.max_len,
        return_tensors='pt',
    )

    # Collapse each tokenizer output to 1-D so items can be stacked into a batch.
    item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
    item['labels'] = target
    return item

class PredictDataset(Dataset):
  """Map-style dataset of unlabelled texts for inference.

  Items are tokenized lazily, on every access, and padded/truncated to
  ``max_len`` tokens; no ``labels`` entry is produced.
  """

  def __init__(self, texts, tokenizer, max_len=512):
    """Store the inputs; nothing is tokenized here.

    Args:
      texts: indexable collection of documents.
      tokenizer: callable taking a string plus ``truncation``, ``padding``,
        ``max_length`` and ``return_tensors`` keyword arguments and returning
        a mapping with ``input_ids`` and ``attention_mask`` tensors.
      max_len: sequence length passed to the tokenizer as ``max_length``.
    """
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.texts = texts

  def __len__(self):
    """Number of documents."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return ``input_ids`` and ``attention_mask`` for document ``idx``."""
    encoded = self.tokenizer(
        str(self.texts[idx]),
        truncation=True,
        padding="max_length",
        max_length=self.max_len,
        return_tensors='pt',
    )

    # Collapse each tokenizer output to 1-D so items can be stacked into a batch.
    return {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}


In [23]:
# Display the combined title + abstract column of the test frame.
kaggle_test_data["text"]

001eval    Comparative Electrical Energy Yield Performanc...
002eval    Effects of graphene nanoplatelets on bio-based...
003eval    Anti-inflammatory action of two novel peptides...
004eval    Efficient all-and-one support vector machines ...
005eval    Driver identification using histogram and neur...
                                 ...                        
147eval    Utilization of Sewage Sludge from Beverage Ind...
148eval    Development of a Gateway for OpenADR-ECHONET L...
149eval    Effect of solution treatment and precipitation...
150eval    An effect-analysis method for species-dependen...
151eval    Very Short-Term Solar Power Forecast using Dat...
Name: text, Length: 151, dtype: object

Dataset Preparation

In [24]:
# Labelled splits consumed by the Trainer.
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = CustomDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Unlabelled competition texts, kept in the row order of kaggle_test_data.
predictText = kaggle_test_data['text'].to_list()
hg_kaggle_test_dataset = PredictDataset(texts=predictText, tokenizer=tokenizer)


In [25]:
# Sanity check: look at the first encoded training example.
train_dataset[0]

{'input_ids': tensor([  101,  4026, 17547,  2478,  3086,  2389, 13589,  1011, 15850,  2784,
          4083,  2007,  4926,  7861,  8270,  4667,  1075, 10476, 15368,  1012,
         13367,  1010,  2019,  9414,  5193,  2291,  1006,  2049,  1007,  2003,
          5186,  2590,  1012,  2152, 10640,  4026, 17547,  2038,  2042,  3273,
          1999,  2312,  1011,  4094,  6125,  1012,  2784,  4083,  4725,  2024,
          2583,  2000, 17908,  1996,  3375,  1010,  4800,  1011,  8789,  2951,
          2000,  3073,  1996,  2560,  4026, 17547,  7561,  1012,  2119, 13589,
          1998, 15850, 12530, 15266,  3073,  3278, 13494,  2005,  4026, 17547,
          1012,  6516,  1010,  2057, 16599,  1037,  5257,  1997,  2784,  4083,
          4118,  4294,  2015,  2029,  8676,  1997,  9530,  6767,  7630,  3508,
          2389, 15756,  6125,  1006, 13229,  1007,  1998,  2146,  2460,  1011,
          2744,  3638,  1006,  1048,  3367,  2213,  1007,  2000, 17908, 13589,
          1998, 15850,  2838,  1998, 16

In [26]:
import torch
import numpy as np

metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred, threshold=0.25):
    """Score multi-label predictions over every individual (sample, label) decision.

    Logits are passed through a sigmoid, thresholded, and flattened to 1-D together
    with the references before being handed to the combined metric.

    Args:
        eval_pred: pair ``(logits, label matrix)``, unpacked positionally.
        threshold: a label counts as predicted when its probability is strictly
            greater than this value.

    Returns:
        The result of ``metrics.compute`` on the flattened predictions and references.
    """
    logits, references = eval_pred
    probabilities = torch.sigmoid(torch.tensor(logits))
    flat_predictions = (probabilities > threshold).int().reshape(-1).numpy()
    flat_references = references.reshape(-1).astype(int)
    return metrics.compute(predictions=flat_predictions, references=flat_references)


In [27]:
# Training Arguments
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration. Checkpoints go under output_dir; save_steps and
# save_total_limit bound how often they are written and how many are kept.
args = TrainingArguments(
    output_dir='./sentiment_transfer_learning_transformer/',
    num_train_epochs=20,
    learning_rate=7e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=1000,
    save_total_limit=2,
)

# val_dataset is scored with compute_metrics whenever trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [28]:
# Fine-tune the model on train_dataset with the settings in `args`.
trainer.train()

 93%|█████████▎| 500/540 [26:45<02:14,  3.37s/it]

{'loss': 0.1987, 'grad_norm': 0.27672135829925537, 'learning_rate': 5.185185185185184e-06, 'epoch': 18.52}


100%|██████████| 540/540 [28:37<00:00,  3.18s/it]

{'train_runtime': 1717.6072, 'train_samples_per_second': 5.019, 'train_steps_per_second': 0.314, 'train_loss': 0.18937459742581403, 'epoch': 20.0}





TrainOutput(global_step=540, training_loss=0.18937459742581403, metrics={'train_runtime': 1717.6072, 'train_samples_per_second': 5.019, 'train_steps_per_second': 0.314, 'train_loss': 0.18937459742581403, 'epoch': 20.0})

In [29]:
# Score the fine-tuned model on val_dataset using compute_metrics.
trainer.evaluate()

100%|██████████| 2/2 [00:00<00:00,  3.74it/s]


{'eval_loss': 0.2783094644546509,
 'eval_accuracy': 0.8743961352657005,
 'eval_f1': 0.6666666666666666,
 'eval_precision': 0.6265060240963856,
 'eval_recall': 0.7123287671232876,
 'eval_runtime': 1.4619,
 'eval_samples_per_second': 15.732,
 'eval_steps_per_second': 1.368,
 'epoch': 20.0}

In [30]:
# Run inference on the unlabelled competition texts. `.predictions` holds the raw
# model outputs (logits); there are no label_ids because the dataset has no labels.
y_test_predict = trainer.predict(hg_kaggle_test_dataset)
print(y_test_predict)

100%|██████████| 10/10 [00:07<00:00,  1.30it/s]

PredictionOutput(predictions=array([[-6.5067363 , -4.091949  , -6.149898  , ..., -4.9803066 ,
        -2.3439193 , -6.640198  ],
       [-3.3513722 , -3.3810325 , -2.427452  , ...,  1.0864809 ,
        -2.0086813 , -3.089976  ],
       [-4.393311  ,  0.5097013 , -5.480092  , ..., -3.8833954 ,
        -5.517521  , -5.951025  ],
       ...,
       [-5.259167  , -5.8582997 , -4.020052  , ..., -5.5689774 ,
        -1.8733624 , -5.124386  ],
       [-4.227424  , -0.81715614, -5.3468013 , ..., -4.801284  ,
        -4.8084598 , -4.6802025 ],
       [-3.398658  , -2.5219204 , -2.8232985 , ..., -3.754766  ,
        -2.5146346 , -1.8701813 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 8.6017, 'test_samples_per_second': 17.555, 'test_steps_per_second': 1.163})





In [31]:
# Predicted logits
y_test_logits = y_test_predict.predictions

# First 5 rows of raw logits (the sigmoid has not been applied yet)
y_test_logits[:5]

array([[-6.5067363 , -4.091949  , -6.149898  ,  0.03918166, -3.1426625 ,
        -7.0071917 ,  2.4264336 , -4.8676805 , -4.121815  , -4.7118845 ,
        -4.37934   , -4.744832  , -3.6363046 , -4.121567  , -5.313007  ,
        -4.9803066 , -2.3439193 , -6.640198  ],
       [-3.3513722 , -3.3810325 , -2.427452  ,  2.699453  , -4.0098753 ,
        -4.6978583 , -0.58111805, -1.9748304 , -2.643843  , -0.85790586,
        -3.178998  ,  4.117632  ,  2.2094984 , -1.8726597 ,  0.288822  ,
         1.0864809 , -2.0086813 , -3.089976  ],
       [-4.393311  ,  0.5097013 , -5.480092  ,  1.6881307 , -3.5434852 ,
        -4.7906413 , -3.227446  , -3.9471576 , -5.1206336 , -4.2978005 ,
        -3.7984552 , -1.3181756 , -2.083273  , -4.9335217 , -4.9607973 ,
        -3.8833954 , -5.517521  , -5.951025  ],
       [-7.009564  , -4.93698   , -5.1033206 , -5.147989  ,  2.4783893 ,
        -3.950112  ,  0.1714337 , -5.409963  , -2.57832   , -6.2833104 ,
         1.5390373 , -7.094262  , -5.08216   , -4.905

In [32]:
# Turn logits into independent per-label probabilities; a sigmoid is used because
# several labels can be active for the same paper.
outputs = torch.tensor(y_test_logits)
probs = torch.sigmoid(outputs)
print(probs)

# Multi-hot decisions: 1.0 where the probability reaches the 0.25 cut-off, 0.0 elsewhere.
# The float64 cast keeps the dtype of the zero-initialised array used previously.
preds = (probs >= 0.25).numpy().astype(np.float64)
print(preds)

tensor([[0.0015, 0.0164, 0.0021,  ..., 0.0068, 0.0876, 0.0013],
        [0.0339, 0.0329, 0.0811,  ..., 0.7477, 0.1183, 0.0435],
        [0.0122, 0.6247, 0.0042,  ..., 0.0202, 0.0040, 0.0026],
        ...,
        [0.0052, 0.0028, 0.0176,  ..., 0.0038, 0.1332, 0.0059],
        [0.0144, 0.3064, 0.0047,  ..., 0.0082, 0.0081, 0.0092],
        [0.0323, 0.0743, 0.0561,  ..., 0.0229, 0.0748, 0.1335]])
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [33]:
# The untransposed test frame has one column per paper, so its column labels are the paper ids.
ids = kaggle_test_df.columns.tolist()

first_column = pd.DataFrame({'id': ids})
print(first_column)

          id
0    001eval
1    002eval
2    003eval
3    004eval
4    005eval
..       ...
146  147eval
147  148eval
148  149eval
149  150eval
150  151eval

[151 rows x 1 columns]


In [34]:
# Label the prediction matrix with the binarizer's class order.
columns = multilabel.classes_
predicted_df = pd.DataFrame(preds, columns=columns)

# Column order written to the output file.
new_column_order = ["CE", "ENV", "BME", "PE", "METAL", "ME", "EE", "CPE", "OPTIC", "NANO", "CHE", "MATENG", "AGRI", "EDU", "IE", "SAFETY", "MATH", "MATSCI"]

# Selecting with a list reorders the columns in one step; a name missing from
# the predictions raises KeyError.
new_df = predicted_df[new_column_order]

# Both frames carry a default 0..n-1 index, so ids and predictions line up row by row.
result = pd.concat([first_column, new_df], axis=1)
print(result)

result.to_csv('test.csv', index=False)


          id   CE  ENV  BME   PE  METAL   ME   EE  CPE  OPTIC  NANO  CHE  \
0    001eval  0.0  0.0  0.0  0.0    0.0  0.0  1.0  0.0    0.0   0.0  1.0   
1    002eval  0.0  0.0  0.0  0.0    0.0  1.0  1.0  0.0    1.0   1.0  1.0   
2    003eval  0.0  0.0  1.0  0.0    0.0  0.0  0.0  0.0    0.0   0.0  1.0   
3    004eval  0.0  0.0  0.0  0.0    0.0  0.0  1.0  1.0    0.0   0.0  0.0   
4    005eval  0.0  0.0  0.0  0.0    0.0  0.0  0.0  1.0    0.0   0.0  0.0   
..       ...  ...  ...  ...  ...    ...  ...  ...  ...    ...   ...  ...   
146  147eval  0.0  1.0  0.0  1.0    0.0  0.0  0.0  0.0    0.0   0.0  1.0   
147  148eval  0.0  0.0  0.0  0.0    0.0  0.0  1.0  1.0    0.0   0.0  0.0   
148  149eval  0.0  0.0  0.0  0.0    1.0  1.0  0.0  0.0    0.0   0.0  1.0   
149  150eval  0.0  0.0  1.0  0.0    0.0  0.0  0.0  0.0    0.0   0.0  1.0   
150  151eval  0.0  1.0  0.0  0.0    0.0  0.0  1.0  0.0    0.0   0.0  0.0   

     MATENG  AGRI  EDU   IE  SAFETY  MATH  MATSCI  
0       0.0   0.0  0.0  0.0     0.0

In [35]:
import os
from datetime import datetime

# Every save gets its own folder named after the current local time, so earlier
# exports are never overwritten.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f'./sentiment_transfer_learning_transformer/{timestamp}/'
os.makedirs(output_dir, exist_ok=True)

# Write the tokenizer files first, then the model, into the same folder.
for save_artifact in (tokenizer.save_pretrained, trainer.save_model):
    save_artifact(output_dir)
