In [1]:
from torch.utils.data import Dataset, DataLoader,TensorDataset
import pandas as pd
import torch
import re
import numpy as np
import os
from torch.nn.utils.rnn import pad_sequence
import numpy as np
class dataset(Dataset):
    def __init__(self, Data_path):
        # We should use source id to classify the conservation
        df = pd.read_csv(Data_path, sep='\t')
        self.data = []
        self.mapping = {'MS':0,
                       'PH':1,
                       'AM':2,
                       'SF':3,
                       'SR':4,
                       'OTHER':5}
        self.GT = []
        text = []
        repeat = { 'AM': 7,
                   'SR': 6,
                   'SF': 4,
                   'MS': 4,
                   'PH': 4,
                 'OTHER':1}
        times = 0
        for _, row in df.iterrows():
            if('AM' in row['classes']):
                times = repeat['AM']
            elif('SR' in row['classes']):
                times = repeat['SR']
            elif('SF' in row['classes']):
                times = repeat['SF']
            elif('MS' in row['classes']):
                times = repeat['MS']
            elif('PH' in row['classes']):
                times = repeat['PH']
            else:
                times = repeat['OTHER']
            for i in range(times):
                text.append(row['utterance'])
                train_data = self.get_train_data(text)
                self.data.append(train_data)
                GT = np.zeros((6))
                if('MS' in row['classes']):
                    GT[0] = 1
                if('PH' in row['classes']):
                    GT[1] = 1
                if('AM' in row['classes']):
                    GT[2] = 1
                if('SF' in row['classes']):
                    GT[3] = 1
                if('SR' in row['classes']):
                    GT[4] = 1
                if('OTHER' in row['classes']):
                    GT[5] = 1
                self.GT.append(GT)
        
    def __len__(self):
        return len(self.data)
    def get_train_data(self,text):
        ans = ""
        if(len(text)<3):
            for i in text:
                ans += i
        else:
            for i in range(3):
                ans += text[-3+i]
        return ans 
    def __getitem__(self, idx):
        data,GT = self.data[idx],self.GT[idx]
        return data,GT

In [11]:
import requests
import argparse
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.tensorboard import SummaryWriter
from transformers import DebertaTokenizer, DebertaForSequenceClassification,RobertaTokenizer, RobertaForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer,AutoModelForSequenceClassification, AutoTokenizer,T5Tokenizer, T5ForConditionalGeneration


writer = SummaryWriter("/kaggle/working/logs")
num_classes = 6
num_epochs=5
batch_size=16
learning_rate=0.001
Name='baseline'
device='cuda'
model_path='t5-small' 
def train(train_path,eval_path):
    ##Step 1 construct training dataset
    eval_dataset = dataset(eval_path)
    train_dataset = dataset(train_path)
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
#     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#     tokenizer = RobertaTokenizer.from_pretrained(model_path)
#     tokenizer = DistilBertTokenizer.from_pretrained(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    train_tokens = tokenizer.batch_encode_plus(
                    train_dataset.data,
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
    )
    test_tokens = tokenizer.batch_encode_plus(
        eval_dataset.data,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    train_labels = torch.tensor(train_dataset.GT)
    test_labels = torch.tensor(eval_dataset.GT)
    # Create DataLoader
    train_data = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], train_labels)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_data = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], test_labels)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
#     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes).to(device)
#     model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=num_classes).to(device)
#     model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_classes).to(device)
#     model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=num_classes).to(device)
#     model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_classes).to(device)
    model = T5ForConditionalGeneration.from_pretrained(model_path, num_labels=num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), 
                                    lr=5e-05, # this learning rate is for bert model , )taken from huggingface website 
                                    eps=1e-08,
                                    weight_decay=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()
    loss_min = np.inf
    # Fine-tune the model
    for epoch in tqdm(range(num_epochs)):
        model.train()
        loss_all = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
            loss = outputs.loss
            loss_all += loss
            loss.backward()
            optimizer.step()
        writer.add_scalar(f"Training_Loss/{Name}", loss_all/len(train_loader), epoch)
        model.eval()
        loss_all = 0
        with torch.no_grad():
            for batch in test_loader:
                input_ids, attention_mask, labels = batch
                outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
                loss = outputs.loss
                loss_all += loss
            writer.add_scalar(f"Validate_Loss/{Name}", loss_all/len(test_loader), epoch)
            print(loss_all/len(train_loader))
            if(loss_all/len(train_loader)<loss_min):
                loss_min = loss_all/len(train_loader)
                model.save_pretrained('/kaggle/working/model.pt')
                tokenizer.save_pretrained('/kaggle/working/tokenizer.pt')
def Show_proposition(train_path):
    data = pd.read_csv(train_path, sep='\t')
    GT = np.zeros(6)
    s = len(data)
    print(s)
    for index, row in data.iterrows():
        if('MS' in row['classes']):
            GT[0] += 1
        if('PH' in row['classes']):
            GT[1] += 1
        if('AM' in row['classes']):
            GT[2] += 1
        if('SF' in row['classes']):
            GT[3] += 1
        if('SR' in row['classes']):
            GT[4] += 1
        if('OTHER' in row['classes']):
            GT[5] += 1
    print(f'MS:{GT[0]/s}, PH:{GT[1]/s}, AM:{GT[2]/s}, SF:{GT[3]/s},SR:{GT[4]/s}, OTHER:{GT[5]/s}')

if __name__ == '__main__':
    mode='train'
    train_path='/kaggle/input/nlp-hw2/train.tsv'
    eval_path='/kaggle/input/nlp-hw2/val.tsv'
    if(mode == 'train'):
        print("Start to use train mode")
        train(train_path,eval_path)
    elif(mode == "analysis"):
        Show_proposition(train_path)

Start to use train mode


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

tensor(0.0804, device='cuda:0', dtype=torch.float64)


 40%|████      | 2/5 [02:43<04:05, 81.79s/it]

tensor(0.0903, device='cuda:0', dtype=torch.float64)


 60%|██████    | 3/5 [04:05<02:43, 81.63s/it]

tensor(0.1102, device='cuda:0', dtype=torch.float64)


 80%|████████  | 4/5 [05:26<01:21, 81.55s/it]

tensor(0.1103, device='cuda:0', dtype=torch.float64)


100%|██████████| 5/5 [06:48<00:00, 81.60s/it]

tensor(0.1314, device='cuda:0', dtype=torch.float64)





In [18]:
import torch
import torch.nn.functional as F
def metric(predict,GT):
    TP,FP,FN = np.zeros((6)),np.zeros((6)),np.zeros((6))
    for i in range(len(GT[0])):
        if(GT[0][i] == 0 and predict[i] == 1):
            FP[i] += 1
        elif(GT[0][i] == 1):
            if(predict[i] == 1):
                TP[i] += 1
            else:
                FN[i] += 1
    return TP,FP,FN
eval_path='/kaggle/input/nlp-hw2/val.tsv'
eval_dataset = dataset(eval_path)
# model = BertForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer = BertTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')

# model = DebertaForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer = DebertaTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
# model = XLNetForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer =  XLNetTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
# model = RobertaForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer =  RobertaTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
# model = DistilBertForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer =  DistilBertTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
test_tokens = tokenizer.batch_encode_plus(
        eval_dataset.data,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
test_labels = torch.tensor(eval_dataset.GT)
test_data = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], test_labels)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
TP,FP,FN = np.zeros((6)),np.zeros((6)),np.zeros((6))
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        outputs = outputs[0].detach().cpu()[0]
        result = np.zeros((6))
        probabilities = F.softmax(outputs, dim=0)
        result[probabilities>0.5] = 1
        a,b,c = metric(result,labels)
        TP += a
        FP += b
        FN += c
    print(TP)
    print(FP)
    print(FN)
    F1 = np.zeros(6)
    for i in range(len(TP)):
        precision = (TP[i])/(TP[i]+FP[i])
        recall = (TP[i])/(TP[i]+FN[i])
        print(precision,recall)
        F1[i] = (2*precision*recall)/(precision+recall)

    print(sum(F1)/len(F1))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[220.  94. 123. 141. 146. 160.]
[125.  44.  16.  42.  16.  52.]
[ 32.  88.  38.  43. 118.  78.]
0.6376811594202898 0.873015873015873
0.6811594202898551 0.5164835164835165
0.8848920863309353 0.7639751552795031
0.7704918032786885 0.7663043478260869
0.9012345679012346 0.553030303030303
0.7547169811320755 0.6722689075630253
0.7182446527556045


In [17]:
import pandas as pd
class test_dataset(Dataset):
    def __init__(self, Data_path):
        # We should use source id to classify the conservation
        df = pd.read_csv(Data_path, sep='\t')
        self.data = []
        self.mapping = {'MS':0,
                       'PH':1,
                       'AM':2,
                       'SF':3,
                       'SR':4,
                       'OTHER':5}
        self.GT = []
        text = []
        for _, row in df.iterrows():
            text.append(row['utterance'])
            train_data = self.get_train_data(text)
            self.data.append(train_data)
            GT = np.zeros((6))
            self.GT.append(GT)
        
    def __len__(self):
        return len(self.data)
    def get_train_data(self,text):
        ans = ""
        if(len(text)<3):
            for i in text:
                ans += i
        else:
            for i in range(3):
                ans += text[-3+i]
        return ans 
    def __getitem__(self, idx):
        data,GT = self.data[idx],self.GT[idx]
        return data,GT

eval_path='/kaggle/input/nlp-hw2/test.tsv'
file_id = pd.read_csv('/kaggle/input/nlp-hw2/test.tsv',sep='\t')
eval_dataset = test_dataset(eval_path)
# model = BertForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
# tokenizer = BertTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/model.pt').to(device)
tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/tokenizer.pt')
test_tokens = tokenizer.batch_encode_plus(
        eval_dataset.data,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
test_labels = torch.tensor(eval_dataset.GT)
test_data = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], test_labels)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
data = {
    'id': [],
    'AM': [],
    'MS': [],
    'OTHER': [],
    'PH': [],
    'SF': [],
    'SR': [],
}
# if('MS' in row['classes']):
#                 GT[0] = 1
#             if('PH' in row['classes']):
#                 GT[1] = 1
#             if('AM' in row['classes']):
#                 GT[2] = 1
#             if('SF' in row['classes']):
#                 GT[3] = 1
#             if('SR' in row['classes']):
#                 GT[4] = 1
#             if('OTHER' in row['classes']):
#                 GT[5] = 1
id = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        outputs = outputs[0].detach().cpu()[0]
        result = np.zeros((6))
        probabilities = F.softmax(outputs, dim=0)
        result[probabilities>0.5] = 1
        data['id'].append(file_id['id'][id]) 
        if(result[0] == 1):
            data['MS'].append(1)
        else:
            data['MS'].append(0)
        if(result[1] == 1):
            data['PH'].append(1)
        else:
            data['PH'].append(0)
        if(result[2] == 1):
            data['AM'].append(1)
        else:
            data['AM'].append(0)
        if(result[3] == 1):
            data['SF'].append(1)
        else:
            data['SF'].append(0)
        if(result[4] == 1):
            data['SR'].append(1)
        else:
            data['SR'].append(0)
        if(result[5] == 1):
            data['OTHER'].append(1)
        else:
            data['OTHER'].append(0)
        id += 1
print(data)
df = pd.DataFrame(data)
df.to_csv('submit.csv', index=False)
            
        

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'id': [1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 11