In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install symspellpy
!pip install spacy
!python -m spacy download en_core_web_sm
#!pip install emot
#pip install contractions
#!pip -q install evaluate
#!pip install googletrans==4.0.0-rc1

# Preprocessing

In [None]:
import tensorflow as tf
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from symspellpy import SymSpell, Verbosity
#from emot.emo_unicode import UNICODE_EMOJI
#from emot.emo_unicode import EMOTICONS_EMO
import nltk
import spacy
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Check the runtime: print how many CUDA devices are visible, then each device's name
n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

In [232]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/test.csv')

In [None]:
train.head()

### preprocessing1

In [237]:
lemma = WordNetLemmatizer()

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(tweet):
    """Normalize one tweet into a space-separated string of lemmatized words.

    Steps: drop whitespace-separated tokens that start with '@', replace every
    non-letter with a space, lowercase, remove English stop words, and
    lemmatize the remaining words with the module-level ``lemma``.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The cleaned text as a single string; empty if nothing survives.
    """
    # The stop-word set used to be rebuilt for every single word; it is now
    # loaded once and reused across all calls.
    stop_words = _english_stopwords()
    without_mentions = " ".join(token for token in tweet.split() if token[0] != '@')
    letters_only = re.sub('[^a-zA-Z]', ' ', without_mentions).lower()
    kept = [lemma.lemmatize(word) for word in letters_only.split() if word not in stop_words]
    return " ".join(kept)

In [238]:
train['text'] = train.text.apply(clean_text)
test['text'] = test.text.apply(clean_text)

In [None]:
train.head()

In [None]:
test.head()

In [207]:
def extract_hashtag(tweet):
    """Return the lemmatized hashtag words of a tweet as one space-separated string.

    Only whitespace-separated tokens starting with '#' are kept; non-letters
    are replaced by spaces and the result is lowercased before lemmatizing.

    Args:
        tweet: Tweet text (str) that still contains its '#' characters.

    Returns:
        Space-separated lemmatized hashtag words; empty if there are none.
    """
    # NOTE(review): another cell overwrites train['text'] with clean_text(),
    # which replaces '#' by a space; if that cell ran first, no hashtag can be
    # found here — confirm the intended cell order.
    hashtags = " ".join(token for token in tweet.split() if token[0] == '#')
    hashtags = re.sub('[^a-zA-Z]', ' ', hashtags).lower()
    # Lemmatize word by word. Previously the comprehension iterated over the
    # string itself, i.e. over single characters, and re-joined them with "".
    words = [lemma.lemmatize(word) for word in hashtags.split()]
    return " ".join(words)

train['word_with_hashtag'] = train.text.apply(extract_hashtag)
train.head()

In [None]:
# Show one example row before/after cleaning.
# NOTE(review): no cell visible in this notebook creates a `clean_text` column
# (clean_text is a function that overwrites `text`), so the second line looks
# like leftover state from an earlier session — confirm.
print(train.loc[11920].text)
print(train.loc[11920].clean_text)

In [None]:
# Drop helper columns.
# NOTE(review): the next cell still reads 'word_with_hashtag'; running the cells
# top to bottom would fail there — confirm the intended order. A 'tweet' column
# is not created anywhere in the visible notebook; presumably it comes from the CSV.
train = train.drop(["tweet","word_with_hashtag"],axis=1)

In [None]:
# Bar chart of the 15 most frequent hashtag words among rows with sentiment == 1.
# NOTE(review): this reads train['word_with_hashtag'], which the preceding cell
# drops — it only works if that cell has not been executed yet.
racist_sexist_hashtag = FreqDist(list(" ".join(train[train['sentiment']==1]['word_with_hashtag']).split())).most_common(15)
racist_sexist_data = pd.DataFrame(racist_sexist_hashtag, columns=['words', 'frequency'])
plt.figure(figsize=(20,20))
sns.barplot(x='words',y="frequency" ,data=racist_sexist_data,color="deepskyblue")

# Title and axis labels (the newlines add padding around the text)
plt.title('Racist and Sexist Words with Hashtags\n',fontsize=20,color="darkorange")
plt.xlabel("\nWords",fontsize=20,color="darkorange")
plt.ylabel("Frequency\n",fontsize=20,color="darkorange")
plt.tight_layout(pad=0)
plt.show()

### preprocessing2

In [178]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS

#Remove '@'/'#' symbols and digits, normalize curly apostrophes, turn hyphens into spaces
def clean_symbols(tweet):
    """Apply a fixed sequence of character-level substitutions to a tweet.

    In order: delete '@' and '#', replace the curly apostrophe with a straight
    one, delete digits, replace '-' with a space. The result is stripped of
    leading/trailing whitespace.
    """
    substitutions = (
        (r'[@#]', ''),
        (r'’', '\''),
        (r'\d', ''),
        (r'-', ' '),
    )
    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def strip_all_entities(text):
    """Remove mentions, http(s) links, non-ASCII characters and punctuation.

    A mention is '@' followed by word characters where the '@' is not directly
    preceded by a word character. Every character of ``string.punctuation``
    (plus a few mojibake characters) is deleted afterwards.
    """
    removal_patterns = (
        r'\B@\w+',          # mentions
        r'https?://\S+',    # links
        r'[^\x00-\x7f]',    # anything outside the ASCII range
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, r'', text)

    banned_list = string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    deletions = {ord(char): None for char in banned_list}
    return text.translate(deletions)

#Blank out words that contain the special characters & or $
def filter_chars(a):
    """Replace every space-separated word containing '$' or '&' with ''.

    Words are split on single spaces and re-joined with single spaces, so a
    removed word leaves its surrounding spaces behind.
    """
    return ' '.join(
        '' if ('$' in word or '&' in word) else word
        for word in a.split(' ')
    )

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    unchanged.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r"\s\s+", " ", text)

# Contraction dictionary: maps lowercase English contractions to their expanded forms
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "that'll": "that will",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "yr": "year"
}

'''
def expand_contractions(text, contraction_mapping=contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        if first_char.isupper():
            expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
            expanded_contraction = expanded_contraction.capitalize()
        else:
            expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text
'''

def expand_contractions(text, mapping=None):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-sensitive and only fires on whole tokens: a key is not
    replaced when it is directly preceded or followed by a word character
    (so "yr" no longer rewrites "syrup"). Longer keys take precedence over
    their prefixes (so "mightn't've" is expanded as a whole instead of being
    cut at "mightn't").

    Args:
        text: Input string.
        mapping: Optional dict of contraction -> expansion. Defaults to the
            module-level ``contraction_mapping``.

    Returns:
        The text with every matched contraction expanded.
    """
    if mapping is None:
        mapping = contraction_mapping
    if not mapping:
        return text

    # Longest keys first so the alternation prefers the most specific match.
    alternatives = sorted(mapping, key=len, reverse=True)
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(key) for key in alternatives) + r")(?!\w)"
    )
    # A function replacement avoids backslash processing of the expansion text.
    return pattern.sub(lambda match: mapping[match.group(0)], text)

#Lemmatization1
lemmatizer = WordNetLemmatizer()

def lemmatize_text1(text):
    """Lemmatize each whitespace-separated token with the WordNet lemmatizer.

    Returns the lemmatized tokens joined by single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

#Lemmatization2
nlp = spacy.load("en_core_web_sm")

# Convert every token to its base (lemma) form
def lemmatize_text2(text):
    """Run the spaCy pipeline on ``text`` and join the token lemmas with spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

#Spelling Correction
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/frequency_dictionary_en_82_765.txt"
# load_dictionary reports a missing file through its boolean return value
# rather than an exception; fail loudly instead of silently "correcting"
# against an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

def correct_spelling_symspell(text):
    """Spell-correct each whitespace-separated word with SymSpell.

    For every word the closest suggestion within edit distance 2 is taken;
    ``include_unknown=True`` makes the lookup return the word itself when no
    suggestion exists, so the first result is always available.

    Returns:
        The corrected words joined by single spaces.
    """
    corrected = []
    for word in text.split():
        suggestions = sym_spell.lookup(
            word,
            Verbosity.CLOSEST,
            max_edit_distance=2,
            include_unknown=True,
        )
        corrected.append(suggestions[0].term)
    return " ".join(corrected)

#Correcting Compound Words

bigram_path = "/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/frequency_bigramdictionary_en_243_342.txt"
# As with the unigram dictionary, a missing file is only signalled through the
# boolean return value; turn that into an explicit error.
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

def correct_compound(text):
    """Run SymSpell's compound correction on each whitespace-separated word.

    Returns:
        The first suggestion for every word, joined by single spaces.
    """
    # NOTE(review): lookup_compound is called once per word, not on the whole
    # sentence, so it can split a glued token but presumably cannot merge
    # tokens that were wrongly separated — confirm this is intended.
    corrected = []
    for word in text.split():
        suggestions = sym_spell.lookup_compound(word, max_edit_distance=2)
        corrected.append(suggestions[0].term)
    text = " ".join(corrected)
    #text = re.sub(r'\b(a|of)\b', '', text)  # remove the stop words "a" and "of"
    return text

In [179]:
# NOTE(review): clean_symbols deletes '@' before strip_all_entities runs, so the
# mention pattern there can no longer match and user names stay in the text —
# confirm whether that is intended.
train["text"] = train["text"].apply(clean_symbols)
test["text"] = test["text"].apply(clean_symbols)

#Lower Casing (the contraction mapping keys are lowercase)
train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()

# Expand contractions while the apostrophes are still present:
# strip_all_entities below deletes all punctuation, and running it first
# (as before) left expand_contractions with nothing to match.
train["text"] = train["text"].apply(expand_contractions)
test["text"] = test["text"].apply(expand_contractions)

train["text"] = train["text"].apply(strip_all_entities)
test["text"] = test["text"].apply(strip_all_entities)

train["text"] = train["text"].apply(filter_chars)
test["text"] = test["text"].apply(filter_chars)

train["text"] = train["text"].apply(remove_mult_spaces)
test["text"] = test["text"].apply(remove_mult_spaces)

#lemmatization
train["text"] = train["text"].apply(lemmatize_text2)
test["text"] = test["text"].apply(lemmatize_text2)

#Lower Casing (expansions such as "I will" and the lemmatizer output may contain capitals)
train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()

#spelling correction
train["text"] = train["text"].apply(correct_spelling_symspell)
test["text"] = test["text"].apply(correct_spelling_symspell)

#correcting compound words
train["text"] = train["text"].apply(correct_compound)
test["text"] = test["text"].apply(correct_compound)

In [180]:
# Replace every run of whitespace (one or more characters) with a single space.
# Raw strings avoid the invalid "\s" escape sequence in a normal string literal.
train['text'] = train['text'].str.replace(r"\s+", " ", regex=True)
test['text'] = test['text'].str.replace(r"\s+", " ", regex=True)

In [None]:
# Spot-check: apply each cleaning function on its own to the text of row 20000.
# NOTE(review): if the pipeline cell above already ran, `t` is already cleaned
# text, so these calls show little change — confirm against the raw CSV.
t = train.loc[20000].text

print(t)
print()
print(1, clean_symbols(t))
print(2, strip_all_entities(t))
print(3, filter_chars(t))
print(4, remove_mult_spaces(t))
print(5, expand_contractions(t))
print(6, lemmatize_text1(t))
print(7, lemmatize_text2(t))
print(8, correct_spelling_symspell(t))
print(9, correct_compound(t))
print()

# Full chain, innermost call first (no lowercasing step in this chain)
t = correct_compound(correct_spelling_symspell(lemmatize_text2(expand_contractions(remove_mult_spaces(filter_chars(strip_all_entities(clean_symbols(t))))))))
print('final:', t)
#train.head()

In [None]:
train.shape

In [None]:
train.head()

In [241]:
train.to_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/trans_pre_train2.csv', index=False)
test.to_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/trans_pre_test2.csv', index=False)

# RoBERTa 1 (best: 83%)

In [244]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/trans_pre_train2.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/trans_pre_test2.csv')

# Training rows with any missing value are dropped; test rows must all be kept
# for the submission, so a missing text is replaced by the placeholder 'text'.
data = data.dropna()
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update the frame under copy-on-write.
test['text'] = test['text'].fillna('text')

In [None]:
texts = data['text'].tolist()
texts

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm


texts = data['text'].tolist()
sentiments = data['sentiment'].tolist()

# Number of fine-tuning epochs. Defined here so the LR schedule below is sized
# from the same value the training loop uses, not from a separate literal.
EPOCHS = 3

# Initialize the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the data (padded to the longest sequence, truncated at 256 tokens)
encoded_texts = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']

# Label mapping (e.g., {'negative': 0, 'neutral': 1, 'positive': 2}).
# Sorting the distinct labels makes the label -> index assignment
# deterministic; iterating a bare set gives no ordering guarantee.
label_mapping = {label: i for i, label in enumerate(sorted(set(sentiments)))}
labels = [label_mapping[s] for s in sentiments]

# Convert to a tensor
labels = torch.tensor(labels)

# Split the dataset 90/10; the seeded generator makes the split reproducible
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42)
)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)

# Initialize the model
num_labels = len(label_mapping)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# Select GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and linear-decay scheduler (no warm-up)
# NOTE(review): AdamW here is the one imported from transformers; newer
# transformers releases steer users to torch.optim.AdamW — confirm the
# installed version still exports it.
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * EPOCHS)

# Training function
def train(model, iterator, optimizer, scheduler):
    """Run one training epoch and return the mean loss per batch.

    Each batch is an (input_ids, attention_mask, labels) triple; tensors are
    moved to the module-level ``device``. The optimizer and the scheduler are
    both stepped once per batch.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
    return running_loss / len(iterator)

# Validation function
def evaluate(model, iterator):
    """Return the mean loss per batch over ``iterator`` without updating weights.

    The model is switched to eval mode and gradients are disabled; tensors are
    moved to the module-level ``device``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            result = model(input_ids, attention_mask=attention_mask, labels=labels)
            running_loss += result.loss.item()
    return running_loss / len(iterator)

# Train and validate: one pass over the training loader, then the validation
# loss, for each epoch.
# NOTE(review): the scheduler created earlier in this cell is sized with a
# literal 3 epochs; keep EPOCHS in sync with it.
EPOCHS = 3
for epoch in range(EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    val_loss = evaluate(model, val_dataloader)
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


In [None]:
# Tokenize the test data
test_texts = test['text'].tolist()
encoded_test_texts = tokenizer(test_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
# return_tensors='pt' already yields torch tensors; wrapping them again in
# torch.tensor(...) only copied them and raised a UserWarning.
input_ids_test = encoded_test_texts['input_ids']
attention_masks_test = encoded_test_texts['attention_mask']

# Batch size
batch_size = 32

# Build the test data loader (order preserved so predictions align with test rows)
test_dataset = TensorDataset(input_ids_test, attention_masks_test)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Model inference function
def predict_sentiment(model, iterator):
    """Return the predicted class index for every example in ``iterator``.

    Each batch is an (input_ids, attention_mask) pair; tensors are moved to the
    module-level ``device``. The prediction is the index of the largest logit.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in tqdm(iterator):
            input_ids, attention_mask = (tensor.to(device) for tensor in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            collected.extend(torch.max(logits, dim=1).indices.tolist())
    return collected

# Predict sentiments
test_predictions = predict_sentiment(model, test_dataloader)

# The model outputs class indices. Map them back through label_mapping so the
# submission contains the same label values as the training 'sentiment'
# column (a no-op when the labels already are 0..n-1 in index order).
index_to_label = {index: label for label, index in label_mapping.items()}
test_predictions = [index_to_label[prediction] for prediction in test_predictions]

# Add the predictions to the dataframe
#test['predicted_sentiment'] = test_predictions

In [195]:
# One row per test example: the test id and its predicted sentiment
submission = pd.DataFrame({"id": test["id"], "sentiment": test_predictions})

# Save the result as a CSV file
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/Yongin-city/csv파일/roberta_preprocessing.csv', index=False)

# RoBERTa 2 (73%)

In [None]:
train = pd.read_csv('/content/drive/MyDrive/용인시 SW 해커톤/trans_pre_train.csv')
test = pd.read_csv('/content/drive/MyDrive/용인시 SW 해커톤/trans_pre_test.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
import torch.nn.functional as F

class NewsDataset(Dataset):
    """Dataset over a DataFrame with 'text' and 'sentiment' columns.

    Each item is tokenized on access and padded/truncated to ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask (both 1-D) and the label as a long tensor."""
        row = self.data.iloc[idx]
        encoded = self.tokenizer.encode_plus(
            row["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # The tokenizer returns a batch dimension of size 1; flatten removes it.
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(row["sentiment"], dtype=torch.long)
        return item

def train_epoch(model, data_loader, optimizer, scheduler, device, loss_function):
    """Train for one epoch and return the mean of the per-batch losses.

    Batches are dicts with 'input_ids', 'attention_mask' and 'label'. The loss
    is computed by ``loss_function`` on the model logits; optimizer and
    scheduler are stepped once per batch.
    """
    model = model.train()
    batch_losses = []
    for batch in tqdm(data_loader):
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        model.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits

        loss = loss_function(logits, labels)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return np.mean(batch_losses)

def evaluate(model, data_loader, device, loss_function):
    """Evaluate without gradient updates.

    Returns:
        (mean per-batch loss, macro-averaged F1 over all examples).
    """
    model = model.eval()
    batch_losses, true_labels, predicted_labels = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_losses.append(loss_function(logits, labels).item())

            # Predicted class = index of the largest logit
            preds = torch.max(logits, dim=1).indices
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(preds.cpu().numpy())

    return np.mean(batch_losses), f1_score(true_labels, predicted_labels, average='macro')


# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# roberta-base tokenizer and a 3-class classification head
# NOTE(review): num_labels=3 is hard-coded — confirm it matches the number of
# distinct values in the 'sentiment' column.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE(review): a later cell sizes the scheduler from this value and then
# overrides num_epochs before its training loop.
num_epochs = 10
batch_size = 8


In [None]:
# Stratified 90/10 train/validation split on the sentiment label
train_data = train
train_data, val_data = train_test_split(train_data, test_size=0.1, stratify=train_data["sentiment"], random_state=42)

# Every example is padded to max_length=512 inside the dataset class
train_dataset = NewsDataset(train_data, tokenizer, max_length=512)
val_dataset = NewsDataset(val_data, tokenizer, max_length=512)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Total optimizer steps for the linear schedule, computed from the value of
# num_epochs at this point in the notebook
total_steps = len(train_loader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


class FocalLoss(torch.nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    Per example: ``alpha * (1 - pt) ** gamma * ce`` where ``ce`` is the
    cross-entropy and ``pt = exp(-ce)``. ``reduction`` is "mean", "sum", or
    anything else to return the unreduced per-example losses.
    """

    def __init__(self, alpha=1, gamma=2, reduction="mean"):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """Compute the focal loss for logits ``inputs`` and class-index ``targets``."""
        ce = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce)
        focal = self.alpha * (1 - pt) ** self.gamma * ce

        if self.reduction == "mean":
            return torch.mean(focal)
        if self.reduction == "sum":
            return torch.sum(focal)
        return focal

# Instantiate the FocalLoss
loss_function = FocalLoss(alpha=1.0, gamma=2.0, reduction="mean").to(device)

best_f1_macro = 0

num_epochs = 1
# The scheduler built earlier in this cell was sized before num_epochs was
# overridden here, so the learning rate would barely decay during this run.
# Rebuild it so the linear schedule spans exactly the steps that are executed.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print("-" * 10)

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, loss_function)
    print(f"Train loss: {train_loss}")

    val_loss, val_f1_macro = evaluate(model, val_loader, device, loss_function)
    print(f"Validation Loss: {val_loss}")
    print(f"Validation F1 Macro: {val_f1_macro}")

    # Keep only the checkpoint with the best validation macro-F1.
    # NOTE(review): the file name says "large" but the model loaded in this
    # notebook is roberta-base; the name is kept because later cells load it.
    if val_f1_macro > best_f1_macro:
        print("F1 Macro score improved. Saving the model.")
        best_f1_macro = val_f1_macro
        torch.save(model.state_dict(), "best_roberta_large_model.bin")

print("Training completed.")

def predict(model, data_loader, device):
    """Return the predicted class index for every example in ``data_loader``.

    Batches are dicts with 'input_ids' and 'attention_mask'; the prediction is
    the index of the largest logit. Runs in eval mode without gradients.
    """
    model = model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            collected.extend(torch.max(logits, dim=1).indices.cpu().numpy())

    return collected

# Load the best model; map_location places the tensors on the active device so
# a checkpoint saved from a GPU run can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load("best_roberta_large_model.bin", map_location=device))

class TestNewsDataset(Dataset):
    """Unlabeled dataset over a DataFrame with a 'text' column.

    Each item is tokenized on access and padded/truncated to ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return 1-D input_ids and attention_mask tensors for row ``idx``."""
        encoded = self.tokenizer.encode_plus(
            self.data.iloc[idx]["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # The tokenizer returns a batch dimension of size 1; flatten removes it.
        return {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}



In [None]:

def predict(model, data_loader, device):
    """Return the predicted class index for every example in ``data_loader``.

    Batches are dicts with 'input_ids' and 'attention_mask'; the prediction is
    the index of the largest logit. Runs in eval mode without gradients.
    (This cell repeats the definition from the end of the training cell.)
    """
    model = model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            collected.extend(torch.max(logits, dim=1).indices.cpu().numpy())

    return collected

# Load the best model; map_location places the tensors on the active device so
# a checkpoint saved from a GPU run can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load("best_roberta_large_model.bin", map_location=device))

class TestNewsDataset(Dataset):
    """Unlabeled dataset over a DataFrame with a 'text' column.

    Each item is tokenized on access and padded/truncated to ``max_length``.
    (This cell repeats the definition from the end of the training cell.)
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return 1-D input_ids and attention_mask tensors for row ``idx``."""
        encoded = self.tokenizer.encode_plus(
            self.data.iloc[idx]["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # The tokenizer returns a batch dimension of size 1; flatten removes it.
        return {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}



In [None]:
# Load and preprocess the test dataset
# NOTE(review): this reads "test.csv" from the working directory, while the
# training frame in this section comes from trans_pre_train.csv — confirm the
# test texts went through the same preprocessing.
test_data = pd.read_csv("test.csv")
test_dataset = TestNewsDataset(test_data, tokenizer, max_length=512)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Make predictions on the test dataset
predictions = predict(model, test_loader, device)

# Create submission DataFrame and save it as a CSV file
# NOTE(review): the prediction column is named "label" here but "sentiment" in
# the earlier submission cell — confirm which one the scorer expects.
submission = pd.DataFrame({"id": test_data["id"], "label": predictions})
submission.to_csv("submission.csv", index=False)

print("Inference completed. The submission.csv file has been generated.")

In [None]:
# Drop rows with missing values, then count the NaNs that remain (expected 0).
# NOTE(review): this cell sits after the training cells of this section, so the
# split above presumably ran on the frame before dropna — confirm cell order.
train = train.dropna()

nan_count = train.isna().sum().sum()

nan_count

# RoBERTa 3

In [None]:
#general purpose packages
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel
from transformers import XLNetTokenizerFast
from transformers import TFXLNetModel
from transformers import XLMTokenizer
from transformers import TFXLMModel



#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
seed=42
# Assigning the value alone seeds nothing; apply it to NumPy and TensorFlow.
np.random.seed(seed)
tf.random.set_seed(seed)

#set style for plots
sns.set_style("whitegrid")
sns.despine()
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer accept the old name; use whichever name is available.
whitegrid_style = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/용인시 SW 해커톤/trans_pre_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/용인시 SW 해커톤/trans_pre_test.csv')

In [None]:
df.head()

In [None]:
from transformers import RobertaConfig
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, AdamWeightDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Split the data 80/20 into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(list(df['text']), list(df['sentiment']), test_size=0.2, random_state=42)

# Tokenization: pad/truncate every example to exactly 64 tokens.
# - `padding` was previously passed twice (True and 'max_length'), which is a
#   SyntaxError; only the 'max_length' strategy is kept.
# - The model below is a TensorFlow model fed through tf.data, so the tokenizer
#   must return TensorFlow tensors ("tf"), not PyTorch ones ("pt").
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=64, return_tensors="tf")
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=64, return_tensors="tf")

# Convert to tf.data.Dataset of (feature dict, label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Create a configuration for the model (3 classes, dropout raised to 0.3)
config = RobertaConfig.from_pretrained('roberta-base', num_labels=3)
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3

# Initialize the model with the configuration
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

# Compile the model using AdamW; the model outputs logits, hence from_logits=True
optimizer = AdamWeightDecay(learning_rate=5e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Early stopping and learning rate reduction, both driven by validation loss
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1)

# Train the model. The datasets are already batched with .batch(32); Keras
# rejects an additional batch_size argument for tf.data.Dataset inputs, so it
# is no longer passed to fit().
model.fit(train_dataset.shuffle(1000).batch(32), validation_data=val_dataset.batch(32), epochs=8, callbacks=[early_stopping, lr_scheduler])
