In [3]:
import pandas as pd
import numpy as np
import os

import torch
from torch.utils.data import TensorDataset, Dataset

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import Trainer, TrainingArguments

os.environ["WANDB_PROJECT"] = "NDE GPT"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [4]:
mendeley = pd.read_csv('../datasets/nde_prod/Mendeley.csv', lineterminator='\n')

In [5]:
mendeley.columns

Index(['_id', 'Date', 'Description', 'includedInDataCatalog', 'Name', 'URL',
       'Abstract'],
      dtype='object')

In [6]:
ids = ['Mendeley_h6trntnthr', 'Mendeley_db2zgbxwmj', 'Mendeley_bx3c6yvrjs', 'Mendeley_n3y4czk55y']

In [7]:
mendeley_ads = mendeley[mendeley['_id'].isin(ids)]
mendeley_ads


Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract
17807,Mendeley_bx3c6yvrjs,2020-11-20,Wethrift helps shoppers save millions of dolla...,Mendeley,macys coupon code,https://data.mendeley.com/datasets/bx3c6yvrjs,
20507,Mendeley_db2zgbxwmj,2023-06-27,"Cara Hack IG, Hack Instagram Twitter Tiktok O...",Mendeley,Cara Hack IG Mudah Lewat HP,https://data.mendeley.com/datasets/db2zgbxwmj,
26250,Mendeley_h6trntnthr,2023-04-05,Cara Hack FB - Android iPhone 2023. Yaitu meng...,Mendeley,Cara Hack FB - Android iPhone 2023,https://data.mendeley.com/datasets/h6trntnthr,
33970,Mendeley_n3y4czk55y,2023-05-08,1. UiPath\nPATH\n\nUiPath is a global software...,Mendeley,RPA stock data,https://data.mendeley.com/datasets/n3y4czk55y,


In [8]:
dataverse = pd.read_csv('../datasets/nde_prod/Harvard Dataverse.csv', lineterminator='\n')

ids = ['Dataverse_10.7910_DVN_LFZEVI', 'Dataverse_10.7910_DVN_2TAGUY']

dataverse_ads = dataverse[dataverse['_id'].isin(ids)]
dataverse_ads


Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract
15501,Dataverse_10.7910_DVN_2TAGUY,2021-04-01,Do you want to buy Julva cream at discount pri...,Harvard Dataverse,Data of Julva Discount Coupon for Dr Anna cabe...,https://doi.org/10.7910/DVN/2TAGUY,
48699,Dataverse_10.7910_DVN_LFZEVI,2020-03-17,Here is the latest discount code for Power Sug...,Harvard Dataverse,Replication Data for: Power Suggest Pro Coupon...,https://doi.org/10.7910/DVN/LFZEVI,


In [9]:
ads = pd.concat([mendeley_ads, dataverse_ads])
ads = ads.assign(Label=1)
ads


Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract,Label
17807,Mendeley_bx3c6yvrjs,2020-11-20,Wethrift helps shoppers save millions of dolla...,Mendeley,macys coupon code,https://data.mendeley.com/datasets/bx3c6yvrjs,,1
20507,Mendeley_db2zgbxwmj,2023-06-27,"Cara Hack IG, Hack Instagram Twitter Tiktok O...",Mendeley,Cara Hack IG Mudah Lewat HP,https://data.mendeley.com/datasets/db2zgbxwmj,,1
26250,Mendeley_h6trntnthr,2023-04-05,Cara Hack FB - Android iPhone 2023. Yaitu meng...,Mendeley,Cara Hack FB - Android iPhone 2023,https://data.mendeley.com/datasets/h6trntnthr,,1
33970,Mendeley_n3y4czk55y,2023-05-08,1. UiPath\nPATH\n\nUiPath is a global software...,Mendeley,RPA stock data,https://data.mendeley.com/datasets/n3y4czk55y,,1
15501,Dataverse_10.7910_DVN_2TAGUY,2021-04-01,Do you want to buy Julva cream at discount pri...,Harvard Dataverse,Data of Julva Discount Coupon for Dr Anna cabe...,https://doi.org/10.7910/DVN/2TAGUY,,1
48699,Dataverse_10.7910_DVN_LFZEVI,2020-03-17,Here is the latest discount code for Power Sug...,Harvard Dataverse,Replication Data for: Power Suggest Pro Coupon...,https://doi.org/10.7910/DVN/LFZEVI,,1


In [10]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-cased', torch_dtype=torch.float16)

# Modify the last layer for binary classification
num_labels = 2  # Number of classes for binary classification
model.config.num_labels = num_labels
model.classifier = torch.nn.Linear(model.config.hidden_size, num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
random_mendeley_sample = mendeley.sample(n=3)
random_dataverse_sample = dataverse.sample(n=3)

data = pd.concat([random_mendeley_sample, random_dataverse_sample]).assign(Label=1)
data 

Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract,Label
3147,Mendeley_3k6zwrktsd,2020-03-09,"Xiong, Q. L., Luo, X.J., Xiao, Y., Liang, P.H....",Mendeley,"Data from: Fire from policy, human interventio...",https://data.mendeley.com/datasets/3k6zwrktsd,,1
44407,Mendeley_v852rn2mv6,2022-01-05,The dataset contains perceptions of Vietnamese...,Mendeley,Vietnamese teachers' perceptions of the implem...,https://data.mendeley.com/datasets/v852rn2mv6,,1
20511,Mendeley_db52syhgr8,2019-09-22,African map showing the investigated countries...,Mendeley,Figure 1. African map showing the investigated...,https://data.mendeley.com/datasets/db52syhgr8,,1
32425,Dataverse_10.7910_DVN_E2XIRQ,2020-12-05,TIAS 11-909 First signed 09/09/2011 Last signe...,Harvard Dataverse,"Executive Agreements Database, Statement Conce...",https://doi.org/10.7910/DVN/E2XIRQ,,1
33454,Dataverse_10.7910_DVN_EKHTI6,2021-10-25,“Review of Economics and Statistics: Forthcomi...,Harvard Dataverse,Replication Data for: Spending Response to a P...,https://doi.org/10.7910/DVN/EKHTI6,,1
32174,Dataverse_10.7910_DVN_DXXA2N,2020-02-06,"Data and codebooks from the project """"Rural In...",Harvard Dataverse,Rural Institutional Innovation: Can Village Co...,https://doi.org/10.7910/DVN/DXXA2N,,1


In [12]:
training_data = pd.concat([data, ads])
training_data

Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract,Label
3147,Mendeley_3k6zwrktsd,2020-03-09,"Xiong, Q. L., Luo, X.J., Xiao, Y., Liang, P.H....",Mendeley,"Data from: Fire from policy, human interventio...",https://data.mendeley.com/datasets/3k6zwrktsd,,1
44407,Mendeley_v852rn2mv6,2022-01-05,The dataset contains perceptions of Vietnamese...,Mendeley,Vietnamese teachers' perceptions of the implem...,https://data.mendeley.com/datasets/v852rn2mv6,,1
20511,Mendeley_db52syhgr8,2019-09-22,African map showing the investigated countries...,Mendeley,Figure 1. African map showing the investigated...,https://data.mendeley.com/datasets/db52syhgr8,,1
32425,Dataverse_10.7910_DVN_E2XIRQ,2020-12-05,TIAS 11-909 First signed 09/09/2011 Last signe...,Harvard Dataverse,"Executive Agreements Database, Statement Conce...",https://doi.org/10.7910/DVN/E2XIRQ,,1
33454,Dataverse_10.7910_DVN_EKHTI6,2021-10-25,“Review of Economics and Statistics: Forthcomi...,Harvard Dataverse,Replication Data for: Spending Response to a P...,https://doi.org/10.7910/DVN/EKHTI6,,1
32174,Dataverse_10.7910_DVN_DXXA2N,2020-02-06,"Data and codebooks from the project """"Rural In...",Harvard Dataverse,Rural Institutional Innovation: Can Village Co...,https://doi.org/10.7910/DVN/DXXA2N,,1
17807,Mendeley_bx3c6yvrjs,2020-11-20,Wethrift helps shoppers save millions of dolla...,Mendeley,macys coupon code,https://data.mendeley.com/datasets/bx3c6yvrjs,,1
20507,Mendeley_db2zgbxwmj,2023-06-27,"Cara Hack IG, Hack Instagram Twitter Tiktok O...",Mendeley,Cara Hack IG Mudah Lewat HP,https://data.mendeley.com/datasets/db2zgbxwmj,,1
26250,Mendeley_h6trntnthr,2023-04-05,Cara Hack FB - Android iPhone 2023. Yaitu meng...,Mendeley,Cara Hack FB - Android iPhone 2023,https://data.mendeley.com/datasets/h6trntnthr,,1
33970,Mendeley_n3y4czk55y,2023-05-08,1. UiPath\nPATH\n\nUiPath is a global software...,Mendeley,RPA stock data,https://data.mendeley.com/datasets/n3y4czk55y,,1


In [13]:
training_data['Name_Description'] = training_data['Name'].str.cat(training_data['Description'], sep=' - ')
training_data

Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract,Label,Name_Description
3147,Mendeley_3k6zwrktsd,2020-03-09,"Xiong, Q. L., Luo, X.J., Xiao, Y., Liang, P.H....",Mendeley,"Data from: Fire from policy, human interventio...",https://data.mendeley.com/datasets/3k6zwrktsd,,1,"Data from: Fire from policy, human interventio..."
44407,Mendeley_v852rn2mv6,2022-01-05,The dataset contains perceptions of Vietnamese...,Mendeley,Vietnamese teachers' perceptions of the implem...,https://data.mendeley.com/datasets/v852rn2mv6,,1,Vietnamese teachers' perceptions of the implem...
20511,Mendeley_db52syhgr8,2019-09-22,African map showing the investigated countries...,Mendeley,Figure 1. African map showing the investigated...,https://data.mendeley.com/datasets/db52syhgr8,,1,Figure 1. African map showing the investigated...
32425,Dataverse_10.7910_DVN_E2XIRQ,2020-12-05,TIAS 11-909 First signed 09/09/2011 Last signe...,Harvard Dataverse,"Executive Agreements Database, Statement Conce...",https://doi.org/10.7910/DVN/E2XIRQ,,1,"Executive Agreements Database, Statement Conce..."
33454,Dataverse_10.7910_DVN_EKHTI6,2021-10-25,“Review of Economics and Statistics: Forthcomi...,Harvard Dataverse,Replication Data for: Spending Response to a P...,https://doi.org/10.7910/DVN/EKHTI6,,1,Replication Data for: Spending Response to a P...
32174,Dataverse_10.7910_DVN_DXXA2N,2020-02-06,"Data and codebooks from the project """"Rural In...",Harvard Dataverse,Rural Institutional Innovation: Can Village Co...,https://doi.org/10.7910/DVN/DXXA2N,,1,Rural Institutional Innovation: Can Village Co...
17807,Mendeley_bx3c6yvrjs,2020-11-20,Wethrift helps shoppers save millions of dolla...,Mendeley,macys coupon code,https://data.mendeley.com/datasets/bx3c6yvrjs,,1,macys coupon code - Wethrift helps shoppers sa...
20507,Mendeley_db2zgbxwmj,2023-06-27,"Cara Hack IG, Hack Instagram Twitter Tiktok O...",Mendeley,Cara Hack IG Mudah Lewat HP,https://data.mendeley.com/datasets/db2zgbxwmj,,1,"Cara Hack IG Mudah Lewat HP - Cara Hack IG, H..."
26250,Mendeley_h6trntnthr,2023-04-05,Cara Hack FB - Android iPhone 2023. Yaitu meng...,Mendeley,Cara Hack FB - Android iPhone 2023,https://data.mendeley.com/datasets/h6trntnthr,,1,Cara Hack FB - Android iPhone 2023 - Cara Hack...
33970,Mendeley_n3y4czk55y,2023-05-08,1. UiPath\nPATH\n\nUiPath is a global software...,Mendeley,RPA stock data,https://data.mendeley.com/datasets/n3y4czk55y,,1,RPA stock data - 1. UiPath\nPATH\n\nUiPath is ...


In [14]:
X, y = training_data['Name_Description'], training_data['Label']

In [15]:
X=[x[0] for x in np.vstack(X).astype(str)]
y=np.vstack(y).astype(int)

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [18]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

encodings = tokenizer(X, padding=True, truncation=True, max_length=512, return_tensors='pt')
dataset = TextDataset(encodings, y)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzubairqazi[0m. Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
