In [None]:
# all the imports
import csv
import difflib
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
# Configuration constants shared by the cells below.
# Paths of the preprocessed training and evaluation (dev) CSV files.
train_dir = "data/preprocessed/big/train.csv"
eval_dir = "data/preprocessed/big/dev.csv"

# Name handed to `from_pretrained` for both the tokenizer and the BERT encoder.
model_path = 'bert-base-uncased'
batch_size = 8  # examples per DataLoader batch
num_classes = 5  # output size of the classification head
epoch = 10  # number of passes over the training loader
learning_rate = 1e-5  # learning rate given to the optimizer

# Seed the NumPy and PyTorch global random number generators.
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the training split and shift every sentiment value down by one.
# Presumably the CSV labels are 1-based and this makes them 0-based class ids
# for the loss — TODO confirm against the data.
df_train = pd.read_csv(train_dir)
df_train['sentiment'] = df_train['sentiment'].map(lambda x: x-1)

In [None]:
# Load the evaluation (dev) split and apply the same minus-one label shift
# that is applied to the training split.
df_eval = pd.read_csv(eval_dir)
df_eval['sentiment'] = df_eval['sentiment'].map(lambda x: x-1)

In [None]:
# Tokenizer loaded from the same pretrained name (`model_path`) as the encoder.
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
# Value passed as `max_length` to the tokenizer by the dataset classes below;
# encoded sequences are padded to this length (special tokens included).
max_length = 64

In [None]:
class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes one labelled text per item.

    Args:
        texts: Indexable sequence of raw texts; each item is passed through ``str``.
        sentiments: Indexable sequence of class ids aligned with ``texts``;
            each item is passed through ``int``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, sentiments, tokenizer, max_len):
        self.texts = texts
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at position ``item``.

        Returns:
            dict with the raw ``text``, 1-D ``input_ids`` and ``attention_mask``
            tensors, and the label as a scalar ``torch.long`` tensor under
            ``sentiment``.
        """
        text = str(self.texts[item])
        sentiment = int(self.sentiments[item])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit replacements for the deprecated `pad_to_max_length=True`.
            # Truncation is requested explicitly instead of relying on the
            # tokenizer's "max_length implies truncation" fallback, so every
            # item has exactly `max_len` tokens and batches can be stacked.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiment': torch.tensor(sentiment, dtype=torch.long)
        }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap a labelled DataFrame in a ``TwitterDataset`` and a ``DataLoader``.

    Args:
        df: DataFrame with a ``text`` column and a ``sentiment`` column.
        tokenizer: Tokenizer forwarded to ``TwitterDataset``.
        max_len: Sequence length forwarded to ``TwitterDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data on every pass. Defaults
            to ``True`` (the previous hard-coded behaviour); pass ``False`` for
            evaluation data when a stable order is wanted.

    Returns:
        A ``DataLoader`` over the dataset, loading in the main process
        (``num_workers=0``).
    """
    ds = TwitterDataset(
        texts=df.text.to_numpy(),
        sentiments=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0,
        shuffle=shuffle,
    )

In [None]:
# Build the training and evaluation loaders with the same helper and settings.
# NOTE(review): because both go through `create_data_loader`, the evaluation
# loader is iterated in shuffled order as well.
train_data_loader = create_data_loader(df_train, tokenizer, max_length, batch_size)
eval_data_loader = create_data_loader(df_eval, tokenizer, max_length, batch_size)

In [None]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer."""

    def __init__(self, n_classes):
        """Load the encoder named by ``model_path`` and size the head for ``n_classes``."""
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the encoder's pooled representation."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a 2-tuple; only the second
        # element (the pooled output) feeds the classification head.
        _, pooled = encoder_outputs
        return self.out(self.drop(pooled))

In [None]:
# Build the classifier and move it to the selected device in one step.
model = SentimentClassifier(num_classes).to(device)


In [None]:
# Optimizer over all model parameters. `AdamW` here is the class imported from
# `transformers`, and `correct_bias` is an option of that implementation
# (NOTE(review): confirm it is still accepted by the installed version).
optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
# The training loop steps the scheduler once per batch, so the schedule must
# span every batch of every epoch.
total_steps = len(train_data_loader) * epoch

# Learning-rate schedule over `total_steps` with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=total_steps,
)

# Cross-entropy loss on the model outputs and the integer class targets.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model, 
    data_loader,
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    n_examples
):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``sentiment`` tensors.
        loss_fn: Callable taking ``(outputs, targets)`` and returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions divided by ``n_examples``), mean_loss is the mean
        of the per-batch losses (NaN when the loader yields no batches).
    """
    model = model.train()

    losses = []
    # A tensor accumulator keeps `.double()` below valid even when the loader
    # yields no batches (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Drop gradients left over from any backward pass made before this call so
    # they cannot leak into the first optimizer step.
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['sentiment'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the largest output along the class axis.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Avoid NumPy's empty-mean warning when no batch was processed.
    mean_loss = np.mean(losses) if losses else np.float64(np.nan)
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_epoch(
    model, 
    data_loader,
    loss_fn, 
    device, 
    n_examples
):
    """Score ``model`` on ``data_loader`` without updating any parameters.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``sentiment`` tensors.
        loss_fn: Callable taking ``(outputs, targets)`` and returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor
        (correct predictions divided by ``n_examples``), mean_loss is the mean
        of the per-batch losses (NaN when the loader yields no batches).
    """
    model = model.eval()

    losses = []
    # A tensor accumulator keeps `.double()` below valid even when the loader
    # yields no batches (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['sentiment'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest output along the class axis.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    # Avoid NumPy's empty-mean warning when no batch was processed.
    mean_loss = np.mean(losses) if losses else np.float64(np.nan)
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time

# One training pass followed by one evaluation pass per epoch, reporting the
# loss and accuracy of each.
for e in range(epoch):
    print(f'Epoch {e + 1}/{epoch}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(df_train),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    eval_acc, eval_loss = eval_epoch(
        model=model,
        data_loader=eval_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(df_eval),
    )
    print(f'Eval loss {eval_loss} accuracy {eval_acc}')
    print()

In [None]:
# Save only the model's state dict (its parameters and buffers), not the module object.
# NOTE(review): nothing in this notebook creates `output/test/`; confirm the directory exists.
torch.save(model.state_dict(), 'output/test/model_state.bin')

In [None]:
# Load the saved state dict back into the model.
# `map_location=device` lets a checkpoint that was saved on a GPU be restored
# on whichever device this session selected (including a CPU-only machine).
model.load_state_dict(torch.load('output/test/model_state.bin', map_location=device))

In [None]:
# Path of the test CSV used for inference.
# NOTE(review): this points at the `small` preprocessed folder while the
# train/dev paths point at `big` — confirm this is intended.
test_dir = "data/preprocessed/small/test.csv"

In [None]:
# Load the test split and preview its first rows.
df_test = pd.read_csv(test_dir)
df_test.head()

In [None]:
class TwitterDataset_test(Dataset):
    """Map-style dataset that tokenizes one unlabelled text per item.

    Args:
        texts: Indexable sequence of raw texts; each item is passed through ``str``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at position ``item``.

        Returns:
            dict with the raw ``text`` and 1-D ``input_ids`` and
            ``attention_mask`` tensors.
        """
        text = str(self.texts[item])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit replacements for the deprecated `pad_to_max_length=True`.
            # Truncation is requested explicitly instead of relying on the
            # tokenizer's "max_length implies truncation" fallback, so every
            # item has exactly `max_len` tokens and batches can be stacked.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [None]:
def create_data_loader_test(df, tokenizer, max_len, batch_size):
    """Build an in-order ``DataLoader`` over the ``text`` column of ``df``.

    The rows are wrapped in a ``TwitterDataset_test`` and served without
    shuffling, in the main process (``num_workers=0``).
    """
    test_texts = df.text.to_numpy()
    dataset = TwitterDataset_test(
        texts=test_texts,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)
    return loader

In [None]:
# Unshuffled loader over the test texts, using the same max length and batch size as training.
test_data_loader = create_data_loader_test(df_test, tokenizer, max_length, batch_size)

In [None]:
def get_predictions(model, data_loader, device=None):
    """Predict a class id for every example served by ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has no
            parameters), so the function no longer depends on a notebook global.

    Returns:
        1-D ``torch.long`` tensor on the CPU with one predicted class id per
        example, in loader order; empty when the loader yields no batches.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    batch_predictions = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Predicted class = index of the largest output along the class axis.
            _, preds = torch.max(outputs, dim=1)
            # Keep one tensor per batch instead of one 0-d tensor per example.
            batch_predictions.append(preds.cpu())

    # torch.cat / torch.stack reject an empty list, so handle that case explicitly.
    if not batch_predictions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions)

In [None]:
# Run inference over the test loader and display the predicted class ids.
test_preds = get_predictions(model, test_data_loader)
test_preds

In [None]:
# Write the submission file: a header row, then one `(id, sentiment)` row per
# test example, where `id` is the example's position in the test loader.
# Predictions are shifted up by one, mirroring the minus-one shift applied to
# the labels when the training data was loaded.
with open("output/test/submit.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id','sentiment'])
    writer.writerows(
        (index, label + 1) for index, label in enumerate(test_preds.tolist())
    )

In [26]:
def cal_accuracy(sub_file, test_file, ori_file, verbose=True):
    """Score a submission by looking up each test text's label in the origin file.

    For every row of ``sub_file`` / ``test_file`` (read in lockstep after their
    header rows), the text in column 1 of the test row is compared against
    column 10 of each origin row. The first origin row whose
    ``difflib.SequenceMatcher.quick_ratio`` exceeds 0.9 supplies the true label
    from its column 5.

    Args:
        sub_file: CSV of predictions; column 1 holds the predicted class.
        test_file: CSV of test examples; column 1 holds the text.
        ori_file: UTF-8 CSV with the label in column 5 and the text in column 10.
            Rows with fewer than 11 columns are ignored.
        verbose: When true (the default, matching the previous behaviour),
            print each matched origin text and its similarity ratio.

    Returns:
        Tuple ``(acc, preds, labels)``. ``preds`` and ``labels`` are lists of
        strings with equal length; a label is ``None`` when no origin row
        matched, and such rows count as incorrect. ``acc`` is the fraction of
        predictions equal to their label (``0.0`` when there are no predictions).
    """
    # Read the origin rows once instead of rescanning the file for every test row.
    with open(ori_file, 'r', encoding='utf-8', newline='') as ori_handle:
        ori_rows = [row for row in csv.reader(ori_handle) if len(row) > 10]

    preds = []
    labels = []
    # NOTE(review): sub_file/test_file keep the platform default encoding, as
    # before; confirm it matches the UTF-8 used for the origin file.
    with open(sub_file, 'r', newline='') as sub_handle, \
            open(test_file, 'r', newline='') as test_handle:
        sub_reader = csv.reader(sub_handle)
        test_reader = csv.reader(test_handle)
        # Skip the header rows.
        next(sub_reader, None)
        next(test_reader, None)

        matcher = difflib.SequenceMatcher(None)
        # zip stops at the shorter file, so preds and labels stay aligned.
        for sub_row, test_row in zip(sub_reader, test_reader):
            if verbose:
                print('\n')
            preds.append(sub_row[1])  # predicted class
            text = test_row[1]  # original text of the example

            # SequenceMatcher caches data about the second sequence, so set the
            # test text once and swap only the first sequence per origin row.
            matcher.set_seq2(text)
            label = None
            for ori_row in ori_rows:
                matcher.set_seq1(ori_row[10])
                ratio = matcher.quick_ratio()
                if ratio > 0.9:
                    if verbose:
                        print(ori_row[10])
                        print(ratio)
                    label = ori_row[5]  # true label
                    break
            # Always append, even without a match, so one unmatched text cannot
            # shift every later label against its prediction.
            labels.append(label)

    # Compute the accuracy; an unmatched (None) label never equals a prediction.
    correct = sum(1 for pred, label in zip(preds, labels) if pred == label)
    acc = correct / len(preds) if preds else 0.0
    return acc, preds, labels
                    

In [27]:
# Score a submission against the origin labels.
# NOTE(review): this reads `data/release/tmp_sub.csv` and `tmp_test.csv`, not
# the `output/test/submit.csv` written earlier in the notebook — confirm these
# are the intended files.
acc, preds, labels = cal_accuracy("data/release/tmp_sub.csv","data/release/tmp_test.csv","data/release/origin.csv")



Two places I'd invest all my money if I could: 3D printing and Self-driving cars!!!
1.0


Awesome! Google driverless cars will help the blind travel more often; https://t.co/QWuXR0FrBpv
1.0


Autonomous vehicles could reduce traffic fatalities by 90%...I'm in!
1.0


Really good presentation from Jan Becker on Bosch's automated vehicle research. #AutoAuto check it out
1.0


Ford just revealed it's Automated Ford Fusion Hybrid Vehicle. Pretty amazing. #fordtrends @ Ford Test̢_ http://t.co/7axya8ogIW
0.9727626459143969


So yeah, just throwing this out there again. Would totally be down to beta test an autonomous car.
1.0


@TeslaMotors Musk reluctant to partner with Apple, Google, but an Android controlled, autonomous smart-car would be awesome!!
1.0


Finished SF&gt;LA drive. Now it̢s LA&gt;OC in rush hour for a meeting. I can̢t wait for an autonomous google car.
0.9173553719008265


The #Google autonomous car paid a visit to Nvidia HQ. Pretty cool technology, but as a person who̢_ 

In [28]:
# Show the accuracy, then the predicted classes and the looked-up true labels.
print(acc)
print(preds)
print(labels)

0.2
['1', '2', '3', '4', '5', '5', '5', '2', '2', '3', '3', '3', '3', '3', '1', '4', '4', '4', '5', '1']
['5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5']
