In [None]:
!pip install pytorch_lightning
!pip install transformers
!pip install sentencepiece

In [None]:
!pip install wandb -qqq

In [None]:
import wandb
wandb.login()

In [1]:
import json
import torch
import torch.nn as nn
import os, glob, re
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import (T5ForConditionalGeneration,
                          AdamW,
                          T5TokenizerFast as token)

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
pl.seed_everything(13)
print(torch.__version__)
PATH = '/content/drive/MyDrive/Coleridge_Initiative/input'

Global seed set to 13


1.8.1+cu101


In [2]:
# Load the question/answer table, then display the rows whose `answer` is missing.
df = pd.read_csv(
    '/content/drive/MyDrive/Coleridge_Initiative/input/v6_data_qa.csv'
)
df.loc[df['answer'].isna()]

Unnamed: 0,question,text,answer,answer_start,answer_end,origin_text,origin_answer_start,origin_answer_end,len text,id


In [5]:
# wandb.init(project="ci", config={
#     "learning_rate": 0.0001,
#     "architecture": "T5",
#     'model': 't5-base',
#     "dataset": "Coleridge Initiative ",
#     'tex_max_len': 396,
#     'asw_max_len': 44,
#     'batch_size' : BATCH_SIZE,
#     'epoch':N_EPOCHS
# })
# config = wandb.config

# Hyper-parameters shared by the dataset, the model wrapper and the training loop.
BATCH = 6
EPOCHS = 1

config = {
    "learning_rate": 0.0001,
    "architecture": "T5",
    'model': 't5-base',
    "dataset": "Coleridge Initiative ",
    'tex_max_len': 396,   # max token length used when encoding question + text
    'asw_max_len': 44,    # max token length used when encoding the answer
    'batch_size': BATCH,
    'epoch': EPOCHS,
    # 'gpu' is not a device string torch accepts; use CUDA when it is
    # available and fall back to the CPU otherwise.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

In [6]:
class CI_Dataset(Dataset):
    """Question-answering dataset that tokenizes DataFrame rows on access.

    Every row of ``data`` must provide the columns ``question``, ``text`` and
    ``answer``.

    Args:
        data: table with one example per row.
        tokenizer: tokenizer *instance* (for example the object returned by
            ``T5TokenizerFast.from_pretrained(...)``) used to encode each row.
        config: mapping providing ``tex_max_len`` (token length of the encoded
            question + text pair) and ``asw_max_len`` (token length of the
            encoded answer).
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer,
        config
    ):

        self.tokenizer = tokenizer
        self.data = data
        self.tex_max_len = config['tex_max_len']
        self.asw_max_len = config['asw_max_len']

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Encode row ``index`` and return the raw fields plus model tensors.

        Returns:
            dict with the raw ``question``, ``text`` and ``answer`` values and
            the 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.
        """
        txt = self.data.iloc[index]

        # Encode with the tokenizer passed to the constructor rather than the
        # module-level `token` name, which is only an instance after another
        # cell has rebound it.
        encode_txt = self.tokenizer(
            txt['question'],
            txt['text'],
            max_length=self.tex_max_len,
            padding='max_length',
            truncation='only_second',   # only the text may be shortened, never the question
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        encode_asw = self.tokenizer(
            txt['answer'],
            max_length=self.asw_max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        labels = encode_asw['input_ids']
        # Replace padding positions with -100 so they do not contribute to the
        # loss; take the pad id from the tokenizer instead of hard-coding 0.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=txt['question'],
            text=txt['text'],
            answer=txt['answer'],
            input_ids=encode_txt['input_ids'].flatten(),
            attention_mask=encode_txt['attention_mask'].flatten(),
            labels=labels.flatten()
        )
        
class CI(nn.Module):
    """``nn.Module`` wrapper around a pretrained T5 conditional-generation model."""

    def __init__(self, config):
        """Keep ``config`` and load the checkpoint named by ``config['model']``."""
        super().__init__()
        self.config = config
        checkpoint = config['model']
        self.model = T5ForConditionalGeneration.from_pretrained(
            checkpoint, return_dict=True
        )

    def forward(self, input_ids, attention_mask, labels):
        """Run the wrapped model and return its ``(loss, logits)`` pair."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss, logits = outputs.loss, outputs.logits
        return loss, logits

In [7]:
# Load the tokenizer for the configured checkpoint and wrap the full table.
# NOTE: this rebinds `token` from the imported tokenizer class to a tokenizer instance.
MODEL = config['model']
token = token.from_pretrained(MODEL)
exampe_dataset = CI_Dataset(df, token, config)

# Sanity check on the first example only: raw fields, then the first ten
# token ids and attention-mask values.
for data in exampe_dataset:
    print(data['question'], data['text'], data['answer'], sep='\n')
    print(data['input_ids'][:10], data['attention_mask'][:10], sep='\n')
    break

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…


the impact of evaluation use on accounting programs performance an exploratory study
the quality of educational programs has been an object of debate and research around the world initiatives such as the program for international student assessment pisa and the trends in international mathematics and science study timss show that international organizations such as the organization for economic co operation and development oecd and the international association for the evaluation of educational achievement iea are trying to verify whether schools are adequately preparing their students by comparing their performances aiming to highlight the strengths and weaknesses among the educational systems of different countries higher education has also been the object of quality evaluations around the world ursin huusko aittola kiviniemi muhonen van kemenade pupius hardjono governmental and non governmental organizations have developed ways to certify institutional quality through evaluation or

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of words.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float in ``[0.0, 1.0]``. When neither string contains a word the two
        (empty) sets are identical, so 1.0 is returned instead of dividing by
        zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only.
        return 1.0
    return float(len(a & b)) / union_size

def loss_func(pred, tr):
    """Binary cross-entropy (with logits) between model outputs and targets.

    pred: raw model outputs (logits)
    tr: target values
    """
    criterion = nn.BCEWithLogitsLoss()
    return criterion(pred, tr)

def train(model, data_loader, optimizer, config, scheduler = None):
    """Run one training pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask, labels)``
            that returns ``(loss, logits)``.
        data_loader: iterable of batches; each batch is a mapping with the
            tensors ``input_ids``, ``attention_mask`` and ``labels``.
        optimizer: optimizer built over the model's parameters.
        config: mapping whose ``device`` entry is the device the batch tensors
            are moved to.
        scheduler: optional learning-rate scheduler, stepped once per batch.
    """
    model.train()
    device = config['device']
    # Iterate the loader that was passed in; each item is the batch mapping
    # itself (no enumerate, which would yield (index, batch) tuples).
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # The model returns its own loss; the logits are not needed here.
        loss, _ = model(input_ids, mask, labels)
        loss.backward()
        # Apply the gradients -- without this call the weights never change.
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

def valid(model, data_loader,config):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: module called as ``model(input_ids, attention_mask, labels)``
            that returns ``(loss, logits)``.
        data_loader: iterable of batches; each batch is a mapping with the
            tensors ``input_ids``, ``attention_mask`` and ``labels``.
        config: mapping whose ``device`` entry is the device the batch tensors
            are moved to.

    Returns:
        Tuple ``(losses, targets)`` of NumPy arrays: the per-batch losses
        stacked row-wise and the label tensors of all batches stacked
        row-wise.

    Raises:
        ValueError: from ``np.vstack`` when ``data_loader`` yields no batch.
    """
    model.eval()
    device = config['device']
    losses = []
    targets = []
    # Evaluation needs no autograd graph.
    with torch.no_grad():
        # Iterate the loader that was passed in; each item is the batch mapping.
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # The model returns its own loss; the logits are not needed here.
            loss, _ = model(input_ids, mask, labels)
            losses.append(loss.detach().cpu().numpy())
            targets.append(labels.detach().cpu().numpy())

    return np.vstack(losses), np.vstack(targets)


def run(config):
    """Train and validate the T5 question-answering model end to end.

    Reads the QA table from disk, keeps one row per distinct question, splits
    the first 2000 of those rows 90/10 into train/validation sets, then runs
    ``config['epoch']`` rounds of training followed by validation and prints
    the mean validation loss after each round.

    Args:
        config: mapping with the entries ``model``, ``batch_size``,
            ``learning_rate``, ``epoch`` and ``device``, plus whatever
            ``CI_Dataset`` and ``CI`` read from it.
    """
    MODEL = config['model']

    df = pd.read_csv('/content/drive/MyDrive/Coleridge_Initiative/input/v6_data_qa.csv')
    df_small = df.drop_duplicates(subset=['question']).reset_index(drop=True)
    train_df, val_df = train_test_split(df_small.head(2000), random_state = 13, test_size = 0.1)
    tr = train_df.reset_index(drop=True)
    vl = val_df.reset_index(drop=True)

    # Bind the tokenizer to its own local name: assigning to `token` here would
    # make `token` local to this function and raise UnboundLocalError when the
    # right-hand side is evaluated.
    tokenizer = token.from_pretrained(MODEL)

    tr_dataset = CI_Dataset(tr, tokenizer, config)
    vl_dataset = CI_Dataset(vl, tokenizer, config)

    # The config stores the batch size under 'batch_size'.
    tr_loader = DataLoader(tr_dataset, batch_size = config['batch_size'], shuffle = True, num_workers = 4)
    vl_loader = DataLoader(vl_dataset, batch_size = 1, num_workers = 4)

    # nn.Module is moved with .to(device).
    model = CI(config).to(config['device'])
    optimizer = AdamW(model.parameters(), lr = config['learning_rate'])
    # config['epoch'] is an epoch count, so loop over a range of it.
    for e in range(config['epoch']):
        train(model, tr_loader, optimizer, config)
        l, t = valid(model, vl_loader, config)
        print('loss:', np.mean(l))

In [None]:
run(config)