In [None]:
! pip install transformers

In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaModel
import logging
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
logging.basicConfig(level=logging.ERROR)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchmetrics import F1Score

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 500  # passed as max_len to the dataset class, which forwards it as max_length
TRAIN_BATCH_SIZE = 4  # not referenced in the visible inference cells
VALID_BATCH_SIZE = 4  # batch size used for the inference DataLoader
EPOCHS = 8  # not referenced in the visible inference cells
LEARNING_RATE = 1e-05  # not referenced in the visible inference cells
# Load the DistilBERT tokenizer
# NOTE(review): truncation=True is given here at load time, while the encode_plus
# calls in the dataset classes pass max_length without a truncation argument —
# confirm that over-long texts are actually truncated as intended.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)


In [None]:
class MultiLabelDataset(Dataset):
    """Torch dataset that tokenizes text rows and pairs them with their label vectors.

    Args:
        dataframe: pandas DataFrame with a ``text`` column (strings) and a
            ``labels`` column (one label vector per row).
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: a DataLoader asks for 0..len-1, which only coincides with
        # the frame's labels when the index is a fresh RangeIndex. ``.iloc`` keeps the
        # dataset correct for filtered / split frames whose index was not reset.
        text = self.text.iloc[index]
        # Collapse every run of whitespace (including newlines) into one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated ``pad_to_max_length=True``;
            # truncation is requested explicitly instead of relying on a fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn as nn
import torch.nn.functional as F

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a feed-forward head that emits 57 raw logits."""

    def __init__(self):
        super().__init__()
        # The attribute names below are also the state-dict keys of saved checkpoints.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.additional_fc1 = torch.nn.Linear(768, 1024)
        self.additional_fc2 = torch.nn.Linear(1024, 512)
        self.classifier = torch.nn.Linear(512, 57)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for a batch.

        ``token_type_ids`` is accepted but not forwarded to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at sequence position 0.
        first_token = encoder_out[0][:, 0]
        features = torch.tanh(self.pre_classifier(first_token))
        features = self.dropout(features)
        features = F.relu(self.additional_fc1(features))
        features = F.leaky_relu(self.additional_fc2(features))
        return self.classifier(features)

In [None]:
# Build the DistilBERT classifier and restore the fine-tuned weights.
model = DistilBERTClass()
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU session also loads when only a CPU is available.
model.load_state_dict(
    torch.load('/kaggle/input/distilbert/bert_model_trained_epoch_199.pt', map_location=device)
)
model.to(device)

In [None]:
#Combined Code Block
#Formatting the test data
import numpy as np
import pandas as pd
# Load the raw test set and build the lower-cased "title\nabstract" text for DistilBERT.
test_df = pd.read_csv('/kaggle/input/test-data1/test.csv')###INSERT TEST DATA HERE###
test_processed = pd.DataFrame()
test_processed["Id"] = test_df['Id']
# A missing Title/Abstract is read as NaN, which would turn the whole concatenated
# text into NaN and make the string clean-up below fail on a float. Treat it as
# empty text instead so every Id still gets a prediction.
test_df['Title'] = test_df['Title'].fillna('').str.lower()
test_df['Abstract'] = test_df['Abstract'].fillna('').str.lower()
# Leading space: presumably so the ' word ' pattern below can match the first word.
test_processed['Text'] = " " + test_df['Title'] + '\n' + test_df['Abstract']
# Tokens removed in the first pass. The '' entry makes the pattern two spaces,
# i.e. it collapses a double space into a single one.
articles = ['a', 'an', 'the', 'this', 'by', 'that', 'there', 'of', 'on', 'in', 'or', 'and', 'at', 'but', 'therefore', 'henceforth', 'to', 'aims', 'discusses', 'presents', 'considers', 'analyzes', 'explains', 'covers', 'deals', 'with', 'about', 'plays', 'has', 'much', 'been', 'attention', 'have', 'shown', 'several', 'efforts', 'for', 'to', 'it', ',', '.', '!', '', '$', '%', '#', '@']
def remove_words(text):
    """Replace each space-delimited occurrence of an ``articles`` token with one space.

    Tokens are processed in list order; the result is stripped of outer whitespace.
    """
    for word in articles:
        text = text.replace(f' {word} ', ' ')
    return text.strip()
test_processed['Text'] = test_processed['Text'].apply(remove_words)
# Domain stop words removed in the second clean-up pass. Each word is listed once,
# in the order it first appeared in the original (heavily repeated) list.
common_stop_words = [
    'copyright', 'study', 'researcher', 'materials', 'proceedings', 'university', 'case', 'fax', 'keywords', 'discussed',
    'validates', 'has', 'there', 'dealing', 'well', 'additionally', 'volume', 'that', 'new', 'several', 'been', 'also',
    'however', 'attention', 'discussion', 'doi', 'conclusion', 'address', 'published', 'therefore',
    'et', 'evaluating', 'determines', 'novelty', 'his', 'theirs', 'deals', 'solve', 'could', 'shown', 'considers',
    'our', 'being', 'concluding', 'aims', 'shows', 'journal', 'thus', 'it', 'much', 'solving',
    'towards', 'myself', 'have', 'reproduce', 'analyzing', 'at', 'discussing', 'propose', 'but', 'considering', 'documents', 'about',
    'issue', 'analysed', 'to', 'investigation', 'will', 'be', 'evaluation', 'validation',
    'ours', 'hers', 'solves', 'introduction', 'which', 'use', 'novel', 'illustrates', 'document', 'addresses',
    'show', 'examining', 'experiments', 'you', 'would', 'method',
    'showing', 'proposing', 'we', 'uses', 'experimental', 'with', 'studies',
    'a', 'henceforth', 'aim', 'results', 'discusses',
    'analyze', 'him', 'ourselves', 'result', 'experiment',
    'determine', 'herself', 'documented', 'technique',
    'investigations', 'comparisons', 'survey', 'is', 'the', 'original',
    'illustrate', 'techniques', 'corresponding', 'in', 'my',
    'conclusively', 'validating', 'proposes',
    'abstract', 'article', 'paper', 'title', 'author', 'authors', 'summary',
    'conference', 'rights', 'reserved', 'email',
    'tel', 'al', 'figure', 'figures', 'table', 'tables',
    'data', 'methodology', 'methodologies', 'research',
    'analysis', 'discussions', 'methods', 'material',
    'conclusions', 'conclude', 'discuss',
    'approach', 'approaches', 'used', 'using', 'based',
    'investigate', 'evaluate', 'assess', 'assessment',
    'proposed', 'work',
    'researchers', 'works', 'demonstrate', 'demonstrates', 'demonstrated',
    'surveys', 'validate',
    'simulation', 'simulations', 'model', 'models', 'modeling', 'modelled',
    'analyses', 'analysing', 'compare', 'compares',
    'comparing', 'comparison', 'evaluates',
    'papers', 'documenting', 'report',
    'reports', 'reported', 'reporting', 'cases',
    'illustrating', 'present', 'presents', 'presenting',
    'determining',
    'addressing', 'solved',
    'utilize', 'utilizes', 'utilizing',
]
# Guard against accidental repeats while keeping first-occurrence order. A set would
# yield an order that changes between interpreter runs, and because the replacements
# below are applied one after another, that order can change the cleaned text.
common_stop_words = list(dict.fromkeys(common_stop_words))
def remove_words(text):
    """Replace each space-delimited occurrence of a stop word with a single space.

    Words are processed in ``common_stop_words`` order. Only occurrences that have a
    space on both sides are matched. The result is stripped of outer whitespace.
    """
    for word in common_stop_words:
        text = text.replace(f' {word} ', ' ')
    return text.strip()
# Second clean-up pass: run the stop-word remover over every document, then show the frame.
test_processed['Text'] = test_processed['Text'].map(remove_words)
print(test_processed)

In [None]:
from torch.utils.data import DataLoader
# NOTE(review): this wraps the raw DataFrame rather than a Dataset, and `test_loader`
# is not referenced again in the visible cells (inference runs on `testing_loader`).
# Presumably a leftover — confirm before relying on it.
test_loader = DataLoader(test_processed, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
from torch import cuda

def predict(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and stack its outputs.

    Each batch must be a dict with ``ids``, ``mask`` and ``token_type_ids`` tensors.
    Batches are moved to the module-level ``device``. The model outputs are returned
    unchanged (no activation is applied here) as one vertically stacked NumPy array.
    """
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for _, batch in tqdm(enumerate(dataloader)):
            input_ids = batch['ids'].to(device, dtype=torch.long).int()
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)

            logits = model(input_ids, attention_mask, segment_ids)
            batch_outputs.append(logits.cpu().numpy())

    return np.vstack(batch_outputs)

In [None]:
# Keep only the cleaned text, under the `text` column name that the dataset class reads.
test_final = pd.DataFrame({'text': test_processed['Text']})
test_final

In [None]:
class PredictLabel1(Dataset):
    """Inference-only torch dataset: tokenizes the ``text`` column, no targets.

    Args:
        dataframe: pandas DataFrame with a ``text`` column of strings.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: a DataLoader asks for 0..len-1, which only coincides with
        # the frame's labels when the index is a fresh RangeIndex. ``.iloc`` makes a
        # prior reset_index unnecessary.
        text = self.text.iloc[index]
        # Collapse every run of whitespace (including newlines) into one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated ``pad_to_max_length=True``;
            # truncation is requested explicitly instead of relying on a fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [None]:
# Wrap the cleaned test text in the inference dataset.
testing_data = test_final

testing_set = PredictLabel1(dataframe=testing_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Loader settings for inference: fixed batch size, no shuffling, loading in the main process.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Run the DistilBERT classifier over the whole test loader; `predict` returns the
# model outputs for all batches stacked into one array.
unlabelled_predictions = predict(model, testing_loader)

In [None]:
# Alias for the raw model outputs; `sigm` is rebound to the sigmoid of these values next.
sigm = unlabelled_predictions

In [None]:
# Element-wise logistic sigmoid, 1 / (1 + e^-x), mapping each output into (0, 1).
sigm = 1 / (1 + np.exp(-np.array(sigm)))

In [None]:
# Alias for the sigmoid values; thresholded in the following step.
final1 = sigm

In [None]:
# Binarize: an entry is True when its sigmoid value is at least 0.608.
# NOTE(review): the origin of the 0.608 cut-off is not shown here — presumably tuned
# offline; confirm and consider naming it as a constant.
final11 = final1 >= 0.608

In [None]:
# Multiply the boolean mask by 1 to turn True/False into 1/0.
final11 = final11 *1

In [None]:
# Wrap the 0/1 array in a DataFrame so the columns can be named and reordered.
final11 = pd.DataFrame(final11)

In [None]:
# Read the processed data file; in the visible cells only its column names are used.
columns = pd.read_csv('/kaggle/input/data-columns/Processed_Data.csv')

In [None]:
# Name the prediction columns after the file's columns from the fifth one onward.
# Assumes the first four columns of Processed_Data.csv are non-label fields and the
# remaining ones line up one-to-one with the model outputs — TODO confirm.
final11.columns = columns.columns[4:]

In [None]:
# Attach the test Ids; the assignment aligns on the row index of both frames.
final11['Id'] = test_processed['Id']

In [None]:
# Read the sample submission; its column names are used below to select and order columns.
sample_cols = pd.read_csv('/kaggle/input/sample-columns/sample_submission.csv')

In [None]:
# Select the columns named in the sample submission, in that order.
final11= final11[sample_cols.columns]

In [None]:
final11
# This is the DistilBERT output: thresholded 0/1 values in the sample submission's column order.

In [None]:
# Configuration for the RoBERTa pass. These rebind the names set for DistilBERT above.
MAX_LEN = 350  # passed as max_len to the dataset class, which forwards it as max_length
TRAIN_BATCH_SIZE = 32  # not referenced in the visible inference cells
VALID_BATCH_SIZE = 32  # batch size used for the inference DataLoader
EPOCHS = 25  # not referenced in the visible inference cells
LEARNING_RATE = 1e-05  # not referenced in the visible inference cells

# NOTE(review): `tokenizer` now refers to the RoBERTa tokenizer, so any later use of the
# name no longer gets the DistilBERT one. Whether `truncation` / `do_lower_case` have
# any effect when passed at load time is not visible here — confirm.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset1(Dataset):
    """Torch dataset returning tokenizer-built tensors together with label vectors.

    Args:
        dataframe: pandas DataFrame with a ``text`` column (strings) and a
            ``labels`` column (one label vector per row).
        tokenizer: object exposing ``encode_plus`` that supports ``return_tensors='pt'``.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: a DataLoader asks for 0..len-1, which only coincides with
        # the frame's labels when the index is a fresh RangeIndex.
        text = self.text.iloc[index]
        # Collapse every run of whitespace (including newlines) into one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated ``pad_to_max_length=True``;
            # truncation is requested explicitly instead of relying on a fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by ``return_tensors='pt'``; a bare
        # squeeze() would also remove the sequence axis when max_len is 1.
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs["token_type_ids"].squeeze(0)

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
class RoBERTaClass(torch.nn.Module):
    """RoBERTa encoder followed by a three-layer feed-forward head emitting 57 raw logits."""

    def __init__(self):
        super().__init__()
        # The attribute names below are also the state-dict keys of saved checkpoints.
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.linear1 = torch.nn.Linear(768, 512)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(512, 256)
        self.leaky_relu = torch.nn.LeakyReLU()
        self.linear3 = torch.nn.Linear(256, 64)
        # Defined but not used in forward().
        self.tanh = torch.nn.Tanh()
        self.classifier = torch.nn.Linear(64, 57)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for a batch.

        ``token_type_ids`` is accepted but not forwarded to the encoder.
        """
        encoded = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0.
        first_token = encoded.last_hidden_state[:, 0]
        hidden = self.dropout(self.linear1(first_token))
        hidden = self.leaky_relu(self.linear2(hidden))
        hidden = self.leaky_relu(self.linear3(hidden))
        return self.classifier(hidden)


In [None]:
# Build the RoBERTa classifier and restore the fine-tuned weights.
model2 = RoBERTaClass()
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU session also loads when only a CPU is available.
model2.load_state_dict(
    torch.load('/kaggle/input/roberta/roberta_model_epoch_25.pt', map_location=device)
)
model2 = model2.to(device)

In [None]:
import numpy as np
import pandas as pd
# Reload the raw test set for the RoBERTa pass: lower-cased "title. abstract", with no
# stop-word removal. This rebinds `test_df` and `test_processed`.
test_df = pd.read_csv('/kaggle/input/test-data1/test.csv')###INSERT TEST DATA HERE###
test_processed = pd.DataFrame()
test_processed["Id"] = test_df['Id']
# A missing Title/Abstract is read as NaN, which would make the concatenated text NaN
# and break the later text.split() call in the dataset. Treat it as empty text instead.
test_df['Title'] = test_df['Title'].fillna('').str.lower()
test_df['Abstract'] = test_df['Abstract'].fillna('').str.lower()
test_processed['Text'] =test_df['Title'] + '. ' + test_df['Abstract']

In [None]:
from torch.utils.data import DataLoader

# NOTE(review): this wraps the raw DataFrame rather than a Dataset, and `test_loader`
# is not referenced again in the visible cells (inference runs on `testing_loader`).
# Presumably a leftover — confirm before relying on it.
test_loader = DataLoader(test_processed, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
from torch import cuda

def predict1(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and stack its outputs.

    Args:
        model: the network to evaluate. It is the model actually used — previously
            the argument was ignored in favor of the global ``model2``.
        dataloader: iterable of dict batches with ``ids``, ``mask`` and
            ``token_type_ids`` tensors.

    Returns:
        One NumPy array with the model outputs of all batches stacked vertically.
        No activation is applied here. Batches are moved to the module-level
        ``device`` before the forward pass.
    """
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for batch_index, data in tqdm(enumerate(dataloader)):
            ids = data['ids'].to(device, dtype=torch.long).int()
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)

            all_predictions.append(outputs.cpu().numpy())

    all_predictions = np.vstack(all_predictions)
    return all_predictions

In [None]:
class PredictLabel(Dataset):
    """Inference-only torch dataset: tokenizes the ``text`` column, no targets.

    Args:
        dataframe: pandas DataFrame with a ``text`` column of strings.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: a DataLoader asks for 0..len-1, which only coincides with
        # the frame's labels when the index is a fresh RangeIndex.
        text = self.text.iloc[index]
        # Collapse every run of whitespace (including newlines) into one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated ``pad_to_max_length=True``;
            # truncation is requested explicitly instead of relying on a fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [None]:
# RoBERTa inference pipeline: text frame -> dataset -> unshuffled loader.
test_final = pd.DataFrame({'text': test_processed['Text']})
testing_data = test_final
testing_set = PredictLabel(dataframe=testing_data, tokenizer=tokenizer, max_len=MAX_LEN)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Score the test set with RoBERTa, then squash the outputs with the logistic sigmoid.
unlabelled_predictions = predict1(model2, testing_loader)
sigm = 1 / (1 + np.exp(-np.asarray(unlabelled_predictions)))
# `si` and `final11_2` are two names for the same DataFrame of sigmoid values.
final11_2 = si = pd.DataFrame(sigm)

In [None]:
# Two extra names bound to the same array of sigmoid values as `sigm`.
yetp = save = sigm

In [None]:
# Binarize the RoBERTa sigmoid values at 0.5694 and store them as 1/0.
final11_2 = (final11_2 >= 0.5694) * 1
# Re-read the processed data file; its column names are used to label the predictions.
columns = pd.read_csv('/kaggle/input/data-columns/Processed_Data.csv')

In [None]:
# Name the RoBERTa prediction columns after the file's columns from the fifth one onward.
final11_2.columns = columns.columns[4:]
# NOTE(review): this assigns Id to `final11` (the DistilBERT frame), not to `final11_2`.
# It looks like a copy-paste slip, but confirm before changing it: giving `final11_2`
# an Id column would make Id take part in the later `final11 + final11_2` addition.
final11['Id'] = test_processed['Id']

In [None]:
# Read the sample submission; its column names are used below to select and order columns.
sample_cols = pd.read_csv('/kaggle/input/sample-columns/sample_submission.csv')

In [None]:
# Union ensemble: add the two 0/1 frames and mark an entry 1 when the sum is at least 1,
# i.e. when at least one of the two models predicted it.
final11_u = ((final11 + final11_2) >= 1) * 1
# Set the Ids, select the sample submission's columns in order, and write the CSV.
final11_u['Id'] = test_processed['Id']
final11_u = final11_u[sample_cols.columns]
final11_u.to_csv('barak.csv', index=False)