In [13]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# 設定文件路徑
train_file_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Assignments\HW1_Relation Extraction from Natural Language using PyTorch\hw1_train-1.csv"

# 讀取訓練資料
train_df = pd.read_csv(train_file_path)

# 檢查並處理空值，將空值設為空字符串
train_df['CORE RELATIONS'] = train_df['CORE RELATIONS'].fillna('')

# 將每個關係轉換為列表形式，並檢查是否為字符串
train_df['CORE RELATIONS'] = train_df['CORE RELATIONS'].apply(lambda x: x.split() if isinstance(x, str) else [])

# 檢查前幾行數據是否正確
print(train_df.head())



   ID                                         UTTERANCES  \
0   0               who plays luke on star wars new hope   
1   1                     show credits for the godfather   
2   2             who was the main actor in the exorcist   
3   3  find the female actress from the movie she's t...   
4   4                    who played dory on finding nemo   

                                     CORE RELATIONS  
0  [movie.starring.actor, movie.starring.character]  
1                            [movie.starring.actor]  
2                            [movie.starring.actor]  
3              [movie.starring.actor, actor.gender]  
4  [movie.starring.actor, movie.starring.character]  


In [17]:
# 將文本轉換為 PyTorch 張量 (這裡可以使用詞嵌入方法)
class RelationDataset(Dataset):
    def __init__(self, dataframe, mlb):
        self.utterances = dataframe['UTTERANCE'].values
        self.labels = mlb.transform(dataframe['CORE RELATIONS'])
    
    def __len__(self):
        return len(self.utterances)
    
    def __getitem__(self, idx):
        utterance = self.utterances[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return utterance, label

# 使用 MultiLabelBinarizer 進行多標籤編碼
mlb = MultiLabelBinarizer()
mlb.fit(train_df['CORE RELATIONS'])

# 切分數據集
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# 創建 PyTorch 的 Dataset
train_dataset = RelationDataset(train_data, mlb)
val_dataset = RelationDataset(val_data, mlb)

# 創建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

KeyError: 'UTTERANCE'