# Import Libraries

In [1]:
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, logging, random, time
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from researchers_code.metrics import *

  from .autonotebook import tqdm as notebook_tqdm


# Load the Datasets

In [2]:
# Column layout shared by all three split files
column_names = ["Sentence", "Ent_1", "Ent_2", "Label"]

# Locations of the test / validation / training splits
test_file = 'datasets/test_with_scobes.txt'
valid_file = 'datasets/valid_with_scobes.txt'
train_file = 'datasets/train_with_scobes.txt'

# Read every split the same way: tab-separated, with the column names supplied
# explicitly (so pandas treats the first line of each file as data, not a header).
test_data, valid_data, train_data = (
    pd.read_csv(split_path, sep='\t', names=column_names)
    for split_path in (test_file, valid_file, train_file)
)

In [3]:
# Peek at the first five rows of the test split to sanity-check the parsed columns
test_data.head(n=5)

Unnamed: 0,Sentence,Ent_1,Ent_2,Label
0,[Bush II] — released a statement opposing [Don...,Bush II,Donald Trump,3
1,after the [Dallas] murders — by blaming “syste...,Dallas,White Americans,3
2,[Zappos] CEO [Tony Hsieh] has been praised for...,Zappos,Tony Hsieh,1
3,But [Shaffer] and other experts faulted [Clint...,Shaffer,Clinton,3
4,"Kulitta, music composition software written by...",Donya Quick,Johann Sebastian Bach,0


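There are five **labels**, where `p` is the first entity (`Ent_1`), `q` is the second entity (`Ent_2`), and the arrow points from the sentiment holder to its target: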
- 0 - neutral
- 1 - positive (p → q)
- 2 - positive (q → p)
- 3 - negative (p → q)
- 4 - negative (q → p)

# Applying the Researchers' DSE2QA Model

In [7]:
from researchers_code.dse2qa import main  # Correct import for main function

# Stand-in for the namespace object that argparse would normally hand to main()
class Args:
    """Plain attribute container mimicking parsed command-line arguments.

    Every constructor argument is stored unchanged under an attribute of the
    same name.

    Parameters
    ----------
    input_type : "T" (template) or "P" (pseudo).
    resample : "none", "up" or "down".
    max_epoch : stored as given.
    batch_size : stored as given.
    random_seed : value used to seed the random number generators.
    pretrain_type : "roberta" or "spanbert".
    temperature : stored as given; the original author notes that the run
        does not work when this attribute is missing.
    """

    def __init__(self, input_type, resample, max_epoch, batch_size, random_seed, pretrain_type, temperature):
        # Paired assignments run left to right, so attributes are created in
        # the same order as the parameters are declared.
        self.input_type, self.resample = input_type, resample
        self.max_epoch, self.batch_size = max_epoch, batch_size
        self.random_seed, self.pretrain_type = random_seed, pretrain_type
        self.temperature = temperature

# Manually create an args object (the values main() would otherwise receive from argparse)
args = Args(
    input_type="T",
    resample="none",
    max_epoch=3,
    batch_size=40,
    random_seed=20180422,
    pretrain_type="roberta",
    temperature=1.0,
)

# Set random seeds for reproducibility: each RNG is seeded exactly once
random.seed(args.random_seed)
np.random.seed(args.random_seed)
torch.manual_seed(args.random_seed)
torch.cuda.manual_seed_all(args.random_seed)
# NOTE: assigning PYTHONHASHSEED here cannot change hash randomisation of the
# already-running kernel; it only reaches child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(args.random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define the appropriate train_path based on the resample strategy
# NOTE(review): these paths live under "dataset/", while the splits loaded with
# pandas in this notebook come from "datasets/" — confirm both directories are intended.
train_paths_by_resample = {
    "none": "dataset/train.txt",
    "up": "dataset/train_over.txt",
    "down": "dataset/train_under.txt",
}
if args.resample not in train_paths_by_resample:
    # An unsupported option value is a ValueError; name the offending value in the message
    raise ValueError(
        f"Invalid resample option: {args.resample!r} "
        f"(expected one of {sorted(train_paths_by_resample)})"
    )
train_path = train_paths_by_resample[args.resample]

# Create a unique experiment ID based on parameters
exp_id = f"DSE2QA_{args.input_type}_{args.resample}_{args.max_epoch}_{args.batch_size}_{args.random_seed}"

# Ensure the 'out' directory exists
os.makedirs("out", exist_ok=True)

# Set up logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(exp_id)
# logging.getLogger returns the same logger object for the same name, so re-running
# this cell would otherwise stack another FileHandler on it (duplicated log lines and
# a leaked file handle). Drop and close whatever a previous run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
f_handler = logging.FileHandler(f"out/{exp_id}.txt", mode="w")  # mode="w": each run overwrites the log file
f_handler.setLevel(logging.INFO)
logger.addHandler(f_handler)

# Call the main function with the args object
# NOTE(review): the saved output of this cell ends in
#   "TypeError: linear(): argument 'input' (position 1) must be Tensor, not str"
# raised inside main(). A str reaching a linear layer is the usual symptom of
# tuple-unpacking a transformers model output that is a dict-like ModelOutput
# (iterating it yields key names). Presumably researchers_code.dse2qa needs
# `return_dict=False` or `.pooler_output` — confirm in that module.
main(args, train_path, logger, exp_id)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: linear(): argument 'input' (position 1) must be Tensor, not str