In [None]:
!pip install pytorch_lightning
!pip install transformers
!pip install sentencepiece

In [None]:
!pip install wandb -qqq

In [None]:
# Log in to Weights & Biases. NOTE(review): the wandb.init(...) call further
# down is commented out, so nothing in this notebook currently logs to wandb —
# confirm whether this login is still required.
import wandb
wandb.login()

In [None]:
import json
import torch
from tqdm import tqdm
import torch.nn as nn
import os, glob, re
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import (T5ForConditionalGeneration,
                          AdamW,
                          T5TokenizerFast as token)

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
# Seed the random number generators through PyTorch Lightning so runs are repeatable.
pl.seed_everything(13)
# Record the PyTorch version in the notebook output.
print(torch.__version__)
# Input directory on the mounted Google Drive.
# NOTE(review): the cells below spell out full paths instead of using PATH — confirm it is still needed.
PATH = '/content/drive/MyDrive/Coleridge_Initiative/input'

Global seed set to 13


1.8.1+cu101


## Model

In [None]:
# Load the question/text/answer training table, then display any rows whose
# answer is missing (the tokenizer cannot encode a NaN answer).
df = pd.read_csv('/content/drive/MyDrive/Coleridge_Initiative/input/v6_data_qa.csv')
df.loc[df['answer'].isna()]

Unnamed: 0,question,text,answer,answer_start,answer_end,origin_text,origin_answer_start,origin_answer_end,len text,id


In [None]:
# wandb.init(project="ci", config={
#     "learning_rate": 0.0001,
#     "architecture": "T5",
#     'model': 't5-base',
#     "dataset": "Coleridge Initiative ",
#     'tex_max_len': 396,
#     'asw_max_len': 44,
#     'batch_size' : BATCH_SIZE,
#     'epoch':N_EPOCHS
# })
# config = wandb.config

# Training hyper-parameters shared by the dataset, the model wrapper and the
# training / validation loops.
BATCH = 6
EPOCHS = 1

config = {
    "learning_rate": 0.0001,
    "architecture": "T5",
    'model': 't5-base',
    "dataset": "Coleridge Initiative ",
    # Maximum token length of the encoded (question, text) pair.
    'tex_max_len': 396,
    # Maximum token length of the encoded answer.
    'asw_max_len': 44,
    'batch_size': BATCH,
    'epoch': EPOCHS,
    # Fall back to the CPU so the notebook still runs when no GPU is available
    # (a hard-coded 'cuda' makes every `.to(config['device'])` call fail there).
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

In [None]:
class CI_Dataset(Dataset):
    """Rows of (question, text, answer) tokenized for seq2seq training.

    Each item encodes ``question`` + ``text`` as the encoder input and
    ``answer`` as the decoder labels.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5TokenizerFast",
        config
    ):
        """
        data: frame with ``question``, ``text`` and ``answer`` columns.
        tokenizer: callable tokenizer instance used to encode every row.
        config: mapping providing ``tex_max_len`` and ``asw_max_len``.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.tex_max_len = config['tex_max_len']
        self.asw_max_len = config['asw_max_len']

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index:int):
        """Return the raw strings plus flattened input ids, attention mask and labels."""
        txt = self.data.iloc[index]

        # Encode with the tokenizer handed to the constructor. The previous code
        # called the module-level name `token`, which only worked once that name
        # had been rebound from the tokenizer class to a loaded instance.
        encode_txt = self.tokenizer(
            txt['question'],
            txt['text'],
            max_length = self.tex_max_len,
            padding= 'max_length',
            truncation = 'only_second',  # only the context may be cut, never the question
            return_attention_mask = True,
            add_special_tokens =True,
            return_tensors = 'pt'
            )

        encode_asw = self.tokenizer(
            txt['answer'],
            max_length = self.asw_max_len,
            padding= 'max_length',
            truncation = True,
            return_attention_mask = True,
            add_special_tokens =True,
            return_tensors = 'pt'
            )

        # Replace padding positions by -100 so they are ignored by the loss.
        # Ask the tokenizer for its pad id; fall back to 0 (the value that was
        # previously hard-coded) when it does not expose one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        labels = encode_asw['input_ids']
        labels[labels == pad_id] = -100

        return dict(
            question=txt['question'],
            text=txt['text'],
            answer=txt['answer'],
            input_ids=encode_txt['input_ids'].flatten(),
            attention_mask=encode_txt['attention_mask'].flatten(),
            labels = labels.flatten()
            )
        
class CI(nn.Module):
    """``nn.Module`` wrapper around a pretrained T5 conditional-generation model."""

    def __init__(self, config):
        """Keep ``config`` and load the checkpoint named by ``config['model']``."""
        super().__init__()
        self.config = config
        checkpoint_name = config['model']
        self.model = T5ForConditionalGeneration.from_pretrained(
            checkpoint_name,
            return_dict = True,
        )

    def forward(self, input_ids, attention_mask, labels):
        """Run the wrapped model and return ``(loss, logits)`` taken from its output."""
        result = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels,
        )
        loss, logits = result.loss, result.logits
        return loss, logits

In [None]:
MODEL = config['model']
# NOTE: this rebinds `token` from the tokenizer class (the import alias) to a
# loaded tokenizer instance. Later code keeps calling `token.from_pretrained(...)`
# on this name, so the cell is order-sensitive and must be run exactly as written.
token = token.from_pretrained(MODEL)
exampe_dataset = CI_Dataset(df, token, config)

# Sanity check: print the first item only (the loop breaks after one iteration).
for data in exampe_dataset:
    print(data['question'])
    print(data['text'])
    print(data['answer'])

    print(data['input_ids'][:10])
    print(data['attention_mask'][:10])    
    break

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…


the impact of evaluation use on accounting programs performance an exploratory study
the quality of educational programs has been an object of debate and research around the world initiatives such as the program for international student assessment pisa and the trends in international mathematics and science study timss show that international organizations such as the organization for economic co operation and development oecd and the international association for the evaluation of educational achievement iea are trying to verify whether schools are adequately preparing their students by comparing their performances aiming to highlight the strengths and weaknesses among the educational systems of different countries higher education has also been the object of quality evaluations around the world ursin huusko aittola kiviniemi muhonen van kemenade pupius hardjono governmental and non governmental organizations have developed ways to certify institutional quality through evaluation or

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Returns a float in ``[0.0, 1.0]``. When neither string contains a word the
    two (empty) sets are identical, so 1.0 is returned instead of raising
    ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # e.g. the model decoded to an empty string and the target is empty too
        return 1.0
    return float(len(c)) / union_size

# def loss_func(pred, target):
#     tmp = []
#     token = token.from_pretrained(MODEL)
#     for i in range(len(t)):
        
#         tt = np.where(t[i] != -100, t[i], 0)
#         decode = token.decode(tt, skip_special_tokens=True, clean_up_tokenization_spaces=True)          
#         tmp.append(jaccard(''.join(decode), target))
#     return tmp

# Tokenizers already loaded by loss_func, keyed by model name, so validation
# does not reload the tokenizer for every batch.
_TOKENIZER_CACHE = {}


def loss_func(pred, target, config, tokenizer=None):
    """Jaccard score between greedy-decoded logits and the reference answers.

    Despite the name, the returned values are similarity scores (higher is
    better), not a loss.

    pred: per-sample logits; ``pred[i]`` is arg-maxed over axis 1 to obtain token ids.
    target: sequence of reference answer strings, aligned with ``pred``.
    config: mapping with the ``model`` name, used to load the tokenizer when
        ``tokenizer`` is not given.
    tokenizer: optional tokenizer exposing ``decode``; when omitted, one is
        loaded once per model name and reused on later calls.

    Returns a list with one Jaccard score per sample.
    """
    if tokenizer is None:
        model_name = config['model']
        tokenizer = _TOKENIZER_CACHE.get(model_name)
        if tokenizer is None:
            tokenizer = token.from_pretrained(model_name)
            _TOKENIZER_CACHE[model_name] = tokenizer

    scores = []
    for i in range(len(pred)):
        decoded = tokenizer.decode(np.argmax(pred[i], axis = 1),
                                   skip_special_tokens=True,
                                   clean_up_tokenization_spaces=True)
        # `decode` already returns a single string, so no join is needed.
        scores.append(jaccard(decoded, target[i]))
    return scores


def train(model, data_loader, optimizer, config, scheduler = None):
    """Run one training epoch over ``data_loader``.

    model: callable as ``model(input_ids, attention_mask, labels)`` returning
        ``(loss, logits)``.
    data_loader: iterable of batches (dicts with ``input_ids``,
        ``attention_mask`` and ``labels`` tensors).
    optimizer: optimizer over the model's parameters.
    config: mapping with the target ``device``.
    scheduler: optional learning-rate scheduler, stepped once per batch.
    """
    model.train()
    device = config['device']
    for txt in data_loader:
        input_ids = txt['input_ids'].to(device)
        mask = txt['attention_mask'].to(device)
        labels = txt['labels'].to(device)

        optimizer.zero_grad()
        loss, _ = model(input_ids, mask, labels)
        loss.backward()
        # Apply the gradients. This call was missing, so the weights were never
        # updated and "training" left the model unchanged.
        optimizer.step()
        if scheduler is not None:
            # The scheduler is stepped after the optimizer.
            scheduler.step()

def valid(model, data_loader,config):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    model: callable as ``model(input_ids, attention_mask, labels)`` returning
        ``(loss, logits)``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``,
        ``labels`` tensors and the raw ``answer`` strings.
    config: mapping with the target ``device``; also forwarded to ``loss_func``.

    Returns four vertically stacked arrays: per-batch losses, reference
    answers, per-sample Jaccard scores and the raw logits.
    NOTE(review): ``np.vstack`` needs every batch to contribute rows of equal
    width; ``run`` validates with ``batch_size = 1`` — confirm before raising it.
    """
    model.eval()
    losses = []
    asw = []
    loss_2 = []
    outs = []
    device = config['device']
    # Evaluation needs no autograd graph; building one per batch only wastes memory.
    with torch.no_grad():
        for txt in data_loader:
            input_ids = txt['input_ids'].to(device)
            mask = txt['attention_mask'].to(device)
            labels = txt['labels'].to(device)
            answer = txt['answer']

            loss, out = model(input_ids, mask, labels)
            out = out.cpu().detach().numpy()
            loss2 = loss_func(out, answer, config)
            losses.append(loss.cpu().detach().numpy())
            loss_2.append(loss2)
            asw.append(answer)
            outs.append(out)

    return np.vstack(losses), np.vstack(asw), np.vstack(loss_2), np.vstack(outs)


def run(config):
    """Fine-tune the model on the QA table, validate after each epoch and save the weights.

    Required ``config`` keys: ``model``, ``batch_size``, ``device``,
    ``learning_rate``, ``epoch`` (plus the keys read by ``CI_Dataset``).

    Optional ``config`` keys (defaults reproduce the previously hard-coded values):
      - ``data_path``: CSV with the question/text/answer columns.
      - ``max_rows``: number of de-duplicated rows to keep before the
        train/validation split (``None`` keeps all of them).
      - ``checkpoint_path``: file the model ``state_dict`` is written to.

    Returns the ``(losses, answers, jaccard_scores, logits)`` tuple produced by
    ``valid`` for the last epoch, or four ``None`` values when
    ``config['epoch']`` is 0.
    """
    MODEL = config['model']
    data_path = config.get(
        'data_path',
        '/content/drive/MyDrive/Coleridge_Initiative/input/v6_data_qa.csv')
    max_rows = config.get('max_rows', 100)
    checkpoint_path = config.get(
        'checkpoint_path',
        '/content/drive/MyDrive/Coleridge_Initiative/model/model_check_predict.pth')

    df = pd.read_csv(data_path)
    # Keep one row per distinct question.
    df_small = df.drop_duplicates(subset=['question']).reset_index(drop=True)
    if max_rows is not None:
        df_small = df_small.head(max_rows)
    train_df, val_df = train_test_split(df_small, random_state = 13, test_size = 0.1)
    tr = train_df.reset_index(drop=True)
    vl = val_df.reset_index(drop=True)

    tokenizer = token.from_pretrained(MODEL)

    tr_dataset = CI_Dataset(tr,tokenizer,config)
    vl_dataset = CI_Dataset(vl,tokenizer,config)

    tr_loader = DataLoader(tr_dataset, batch_size = config['batch_size'], shuffle = True, num_workers = 4)
    # Validation uses single-sample batches (valid() stacks per-batch results).
    vl_loader = DataLoader(vl_dataset, batch_size = 1, num_workers = 4)

    model = CI(config).to(config['device'])
    optimizer = AdamW(model.parameters(), lr = config['learning_rate'])

    # Defined up front so the return below is valid even with zero epochs.
    l = t = j = o = None
    bar =  tqdm(range(config['epoch']))
    for e in bar:
        train(model, tr_loader, optimizer, config)
        l, t, j, o = valid(model, vl_loader, config)
        print('Loss:', np.mean(l))
        bar.set_description(f'Jaccard Loss: {np.mean(j)}, Epoch: {e +1}')

    # https://pytorch.org/tutorials/beginner/saving_loading_models.html
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        # Make sure the target directory exists before writing the checkpoint.
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)
    torch.cuda.empty_cache()
    return l,t, j, o

In [None]:
# Train, validate and save the model; the four results are the values returned
# by valid() for the last epoch (losses, answers, Jaccard scores, logits).
l, t, j, o = run(config)

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




Jaccard Loss: 0.42023809523809524, Epoch: 1: 100%|██████████| 1/1 [00:34<00:00, 34.17s/it]

Loss: 6.3800325





## Submit

In [None]:
# NOTE(review): `train` rebinds the name of the train() function defined above
# to a DataFrame, so calling run(config) after this cell fails when it reaches
# train(...). A later cell also reads `train[...]`, so a rename must touch both.
train = pd.read_csv('/content/drive/MyDrive/Coleridge_Initiative/input/train.csv')
submission_df = pd.read_csv('/content/drive/MyDrive/Coleridge_Initiative/input/sample_submission.csv', index_col=0)
df_test_head = pd.read_csv('/content/drive/MyDrive/Coleridge_Initiative/input/df_test_publications.csv')
# Lower-cased union of every distinct dataset title and dataset label in the training data.
datasets_titles = [x.lower() for x in set(train['dataset_title'].unique()).union(set(train['dataset_label'].unique()))]


In [None]:
def clean_text(txt):
    """Normalize ``txt`` to lower-case words separated by single spaces.

    The value is converted with ``str()``, lower-cased, every run of characters
    outside ``A-Za-z0-9`` is replaced by one space, and surrounding whitespace
    is stripped.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

def make_interval(start:int, txt:str, interval: int)->str:
    """
    Cut a window of at most ``2 * interval`` words out of ``txt``.

    start: int character offset into ``txt`` (e.g. from ``re.search(...).start()``)
    txt: str whitespace-separated text
    interval: int number of words to keep before the word at ``start``;
        the window holds twice as many words in total

    return: str the selected words joined by single spaces
    """
    tokens = txt.split()
    window = interval * 2
    # Translate the character offset into a word index.
    words_before = len(txt[:start].split())
    first = words_before - interval

    if first <= 0:
        # Not enough words in front of the match: take the window from the beginning.
        return ' '.join(tokens[:window])

    last = first + window
    if last >= len(tokens):
        # The window would run past the end: keep everything from `first` on.
        return ' '.join(tokens[first:])
    return ' '.join(tokens[first:last])


def make_pred(question: dict, pre_model, tokenizer)->str:
    """Generate an answer string for one question/context pair.

    question: mapping with the keys ``question`` and ``text``.
    pre_model: wrapper whose ``.model`` attribute exposes ``generate``. If it
        carries a dict ``config`` with ``tex_max_len``, that value is used as
        the encoder input length; otherwise 396 is used.
    tokenizer: callable tokenizer that also provides ``decode``.

    return: the decoded generated sequences joined by single spaces.
    """
    # Keep the input length in sync with the training configuration when available.
    cfg = getattr(pre_model, 'config', None)
    max_input_len = cfg.get('tex_max_len', 396) if isinstance(cfg, dict) else 396

    encode_test = tokenizer(question['question'],
                        question['text'],
                        max_length = max_input_len,
                        padding= 'max_length',
                        truncation = 'only_second',
                        return_attention_mask = True,
                        add_special_tokens =True,
                        return_tensors = 'pt'
                        )

    input_ids = encode_test['input_ids']
    attention_mask = encode_test['attention_mask']
    # Put the inputs on the same device as the model, if the model reports one.
    device = getattr(pre_model.model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    gen_ids = pre_model.model.generate(
        input_ids = input_ids,
        attention_mask = attention_mask,
        num_beams = 5,
        # If set to an int > 0, every n-gram of that size may occur only once.
        no_repeat_ngram_size = 1,
        # Number of independently computed sequences returned per batch element.
        num_return_sequences = 1,
        # Whether to use sampling; otherwise greedy decoding is used.
        do_sample=True,
        # Number of highest-probability vocabulary tokens kept for top-k filtering.
        top_k=0,
#         temperature=0.7,
        # (temperature: value used to modulate the next-token probabilities.)
        top_p=0.92,
        # Maximum length of the generated sequence.
        max_length = 8,
        # Repetition penalty; 1.0 means no penalty.
        repetition_penalty = 2.5,
        # Length penalty applied to the generated sequences.
        # NOTE(review): the original note said values < 1.0 favour shorter and
        # values > 1.0 longer sequences — verify against the installed
        # transformers version.
        length_penalty =0.5,
        early_stopping = True,
        use_cache = True
    )

    decode = [
              tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
              for ids in gen_ids
              ]
    return ' '.join(decode)

# NOTE(review): df_test_head was already loaded from the Drive path in the
# previous cell; this re-reads it from a relative '../input' path, and the next
# line recomputes datasets_titles with the same expression. Confirm which
# location is intended and drop the duplicate.
df_test_head = pd.read_csv('../input/model-ci/df_test_publications.csv')
datasets_titles = [x.lower() for x in set(train['dataset_title'].unique()).union(set(train['dataset_label'].unique()))]

In [None]:
# Rebuild the model wrapper and load the fine-tuned weights.
model = CI(config)
# The freshly built model lives on the CPU, so map the checkpoint there as well;
# without map_location a checkpoint saved from a GPU cannot be loaded on a
# machine that has no GPU.
model.load_state_dict(torch.load('/content/drive/MyDrive/Coleridge_Initiative/model/model_check_predict.pth',
                                 map_location='cpu'))
model.eval()

labels = []
MODEL = config['model']
tokenizer = token.from_pretrained('../input/model-ci/token')
for index in submission_df.index:
    publication_text = df_test_head[df_test_head['pub_id'] == index].text.str.cat(sep='\n').lower()
    # Normalize the publication once per publication instead of once per title.
    pt = clean_text(publication_text)
    label = []
    for dataset_title in datasets_titles:
        dt = clean_text(dataset_title)
        if not dt:
            # A title with no alphanumeric characters cannot be searched for.
            continue

        # Whole-word match. `dt` contains only [a-z0-9 ] after clean_text, so it
        # needs no escaping. The previous `dt in pt` substring test could pass
        # while this regex found nothing (title embedded in a longer word),
        # which crashed on `.start()` of None.
        match = re.search(r'\b%s\b' % dt, pt)
        if match is None:
            continue

        txt = make_interval(match.start(), pt, 396)

        data = {'question':dt,
                'text': txt,
                }
        y_ = make_pred(data, model, tokenizer)
        y_ = clean_text(y_)

#             if y_.split()[:3] not in label:
        label.append(y_)

    # One '|'-separated prediction string per publication.
    labels.append('|'.join(label))

submission_df['PredictionString'] = labels
submission_df.to_csv('submission.csv')
submission_df