In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import re


import torch
import torch.nn as nn
from torch.utils.data import Subset
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler,random_split

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, f1_score

import transformers
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Input locations (Kaggle dataset mounts): tokenizer files, base encoder
# weights, the test CSV, and the fine-tuned classifier checkpoint.
TOKENIZER_SCIBERT_PATH = "/kaggle/input/tokenizerscibert/"
MODEL_SCIBERT_PATH = "/kaggle/input/modelscibert/"
TEST_DATASET_PATH = "/kaggle/input/test-csv/test.csv"
CUSTOM_MODEL_PATH = "/kaggle/input/finetunemodel/model.bin"

In [None]:
# Load the test set. Later cells read its 'Id', 'Title' and 'Abstract' columns.
df = pd.read_csv(TEST_DATASET_PATH)
df.head()

In [None]:
# Merge title and abstract into the single 'text' field the model consumes.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN, which would make the whole row's text NaN and break the
# string-based cleaning step that follows.
df['text'] = df['Title'].fillna('') + " " + df['Abstract'].fillna('')

# The source columns are no longer needed once 'text' exists.
df = df.drop(columns=['Title', 'Abstract'])
df.head()

In [None]:
def clean_text(text):
    """Normalize one document into lowercase, space-separated tokens.

    Steps, in order:
      1. lowercase;
      2. collapse every run of characters other than ASCII letters and
         ``? . ! , ¿`` into a single space;
      3. drop tokens matching ``http\\S+``;
      4. drop ``<...>`` spans;
      5. delete the remaining punctuation listed in ``punctuations``
         (commas and ``¿`` are not in that list and survive);
      6. collapse whitespace.

    Args:
        text: The raw string. Any non-string value (``None``, NaN from a
            missing CSV cell, ...) is treated as an empty document.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # A missing cell arrives as float NaN; calling .lower() on it would raise.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Keep only letters and ? . ! , ¿ ; everything else becomes a space.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # NOTE(review): step 2 has already removed ':', '/', '<' and '>', so the
    # URL pattern can only match a bare "http" followed by more letters
    # (e.g. "https") and the tag pattern can never match. The order is kept
    # as-is because the fine-tuned checkpoint was presumably trained on text
    # cleaned exactly this way -- confirm before reordering.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'<.*?>', '', text)

    # Delete every listed punctuation character in a single pass.
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    text = text.translate(str.maketrans('', '', punctuations))

    # The text is already lowercase here, so only whitespace needs collapsing.
    return " ".join(text.split())

In [None]:
# Clean every document; the function is passed directly, no wrapper needed.
df['text'] = df['text'].apply(clean_text)
df.head()

In [None]:
# Sequence length passed to BERTDataset, which truncates and pads every
# example to exactly this many tokens.
MAX_LEN = 512
# Tokenizer loaded from the local tokenizer directory configured above.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SCIBERT_PATH)

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a linear classification head.

    The encoder is loaded from ``MODEL_SCIBERT_PATH``; the head maps the
    encoder's pooled output to ``num_labels`` raw logits (no activation is
    applied here).

    Args:
        num_labels: Number of output logits. Defaults to 57, the size the
            fine-tuned checkpoint loaded in this notebook expects.
    """

    def __init__(self, num_labels=57):
        super().__init__()
        # The attribute names `roberta` and `fc` become the key prefixes of
        # the module's state_dict, so they must stay as they are for the
        # saved checkpoint to load -- even though the encoder is loaded from
        # the SciBERT path rather than being a RoBERTa model.
        self.roberta = AutoModel.from_pretrained(MODEL_SCIBERT_PATH)
        # Take the head's input width from the encoder config instead of
        # hard-coding it, so the head always matches the loaded encoder.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return one row of ``num_labels`` logits per input sequence.

        Args:
            ids: Token-id tensor.
            mask: Attention-mask tensor.
            token_type_ids: Segment-id tensor.
        """
        # With return_dict=False the encoder returns a tuple; the second
        # element (the pooled output) feeds the classifier.
        _, features = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.fc(features)
  

# Run on the GPU when one is available, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Build the architecture, then restore the fine-tuned weights into it.
model = BERTClass()
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model_state_dict = torch.load(CUSTOM_MODEL_PATH, map_location=device)
model.load_state_dict(model_state_dict)
model.to(device)

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Every example is truncated and padded to this many tokens.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df["text"]
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` (0-based position) and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids``, each a
            ``torch.long`` tensor built from the tokenizer output.
        """
        # Positional lookup: a DataLoader asks for positions 0..len-1, while
        # plain `series[index]` is a *label* lookup that fails (or returns the
        # wrong row) whenever the frame's index is not 0..n-1, e.g. after
        # filtering or shuffling rows.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }


In [None]:
# Wrap the cleaned test frame; rows are tokenized lazily when indexed.
test_dataset = BERTDataset(df, tokenizer, MAX_LEN)

In [None]:
# shuffle=False keeps predictions in the same order as the rows of the
# test frame, which the submission relies on.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# One entry per test row; each entry is that row's list of raw logits.
predictions = []

# Put the model in evaluation mode before predicting.
model.eval()

# Gradients are not needed for inference.
with torch.no_grad():
    print(len(test_loader))
    for index, batch in enumerate(test_loader):
        input_ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)

        # Forward pass: one row of logits per sample in the batch.
        outputs = model(input_ids, mask=mask, token_type_ids=token_type_ids)

        # Convert without squeeze(): squeezing a final batch that holds a
        # single sample would drop the batch dimension, and extend() would
        # then append that sample's logits as separate scalar entries
        # instead of one row, misaligning predictions with the test rows.
        batch_predictions = outputs.tolist()
        predictions.extend(batch_predictions)

        if index % 50 == 0:
            print(f"{index} completed")

# Every test row must have exactly one prediction row.
assert len(predictions) == len(test_dataset), "prediction count does not match test rows"

# Single 'Prediction' column whose cells are the per-row logit lists.
output_df = pd.DataFrame({'Prediction': predictions})

In [None]:
# Preview the raw model outputs collected by the inference loop.
output_df.head()

In [None]:
# Decision threshold applied to the raw logits: a label is predicted (1)
# unless its logit is strictly below this value.
# NOTE(review): presumably tuned on validation data -- confirm the origin of -1.05.
LOGIT_THRESHOLD = -1.05


def replace_numbers(x):
    """Binarize one row of logits: 0 where the value is < LOGIT_THRESHOLD, else 1."""
    return [0 if i < LOGIT_THRESHOLD else 1 for i in x]


# Threshold every row. Series.map is used instead of DataFrame.applymap,
# which newer pandas versions deprecate.
output_df1 = output_df['Prediction'].map(replace_numbers).to_frame()

# Label columns, in the order of the model's outputs.
column_titles = [
    'cs.AI', 'cs.AR', 'cs.CE', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.DB', 'cs.DC',
    'cs.DM', 'cs.GT', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.NI', 'cs.OS',
    'cs.PL', 'cs.RO', 'cs.SD', 'cs.SE',
    'econ.EM', 'econ.GN', 'econ.TH',
    'eess.AS', 'eess.IV', 'eess.SP',
    'math.AC', 'math.AP', 'math.AT', 'math.CO', 'math.CV', 'math.GR',
    'math.IT', 'math.LO', 'math.NT', 'math.PR', 'math.QA', 'math.ST',
    'q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC', 'q-bio.TO',
    'q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM', 'q-fin.PR',
    'q-fin.RM', 'q-fin.TR',
    'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.TH',
]

# Each prediction row must supply exactly one value per label column.
label_rows = output_df1['Prediction'].tolist()
if any(len(row) != len(column_titles) for row in label_rows):
    raise ValueError(
        f"every prediction row must contain {len(column_titles)} values"
    )

# Build all label columns at once from an (n_rows, n_labels) array rather
# than extracting one column at a time. The reshape keeps the 2-D shape
# even when there are no rows.
label_matrix = np.asarray(label_rows, dtype=int).reshape(-1, len(column_titles))
new_df = pd.DataFrame(label_matrix, columns=column_titles)

# Attach ids by position, so a row-count mismatch raises instead of being
# silently filled with NaN through index alignment.
new_df.insert(0, 'Id', df['Id'].to_numpy())

In [None]:
# Preview the submission frame: 'Id' followed by one 0/1 column per label.
new_df.head(10)

In [None]:
# Sample submission; the next cell uses only its column order.
# NOTE(review): this path is relative while the other inputs use absolute
# /kaggle/input/... paths -- it resolves only from the expected working directory.
df2 = pd.read_csv("../input/sample-csv/sample_submission.csv")

In [None]:
# Take the first 58 column names of the sample submission as the output order.
# NOTE(review): 58 presumably means 'Id' plus the 57 label columns -- confirm.
column_titles = list(df2.columns[:58])

# Reorder the prediction frame to that layout.
new_df = new_df.loc[:, column_titles]

# Write the submission file without the pandas index column.
csv_file_path = 'siang.csv'
new_df.to_csv(csv_file_path, index=False)