In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/baml_contest

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/baml_contest


In [2]:
%pip install sentencepiece



In [3]:
# %pip install transformers

In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
from tqdm.auto import tqdm, trange
import os
from sklearn.model_selection import train_test_split

In [5]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")
set_seed()

Random seed set as 42


In [6]:
df = pd.read_excel('../dataset_making/df_result_final.xlsx')
df = df[df.answer!='-']
df.columns = ['X', 'Y']
df = df[~df['X'].isna()]
df = df[~df['Y'].isna()]
print(df.shape[0])
df.head()

9906


Unnamed: 0,X,Y
0,"–•–æ—Ä–æ—à–∞—è –õ–µ—Ä–æ—á–∫–∞ —Ç—ã –∂–µ–Ω—â–∏–Ω–∞, —Ö–æ–∑—è—é—à–∫–∞ –∏ –∑–∞–±–æ—Ç–ª...",–¢–µ–∫—Å—Ç –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç —Å–æ–±–æ–π –Ω–∞–±–æ—Ä –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤ –ø–æ...
1,–õ–∞–ø–∫–æ–∑–∞–≤—Ä–∏–∫ –≤—Å—Ç—Ä–∞–∏–≤–∞–µ—Ç —É–∑–µ–ª –≤ –ø—É—Å—Ç—É—é —è—á–µ–π–∫—É –∏ ...,"–í –ø–æ—Å—Ç–µ –æ–±—Å—É–∂–¥–∞—é—Ç—Å—è —Ä–∞–∑–ª–∏—á–Ω—ã–µ –∞—Å–ø–µ–∫—Ç—ã –∏–≥—Ä—ã ""–õ–∞..."
2,–∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ. —ç–º–ø–∏—Ä–∏—á–µ—Å–∫–∏–º –º–µ—Ç–æ–¥–æ–º –Ω–∞–¥–æ —ç—Ç–æ –ø—Ä–æ–≤–µ...,–ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏ –æ–±—Å—É–∂–¥–∞—é—Ç —Ä–∞—Ü–∏–æ–Ω –ø—Ä–∞–≤–∏–ª—å–Ω–æ–≥–æ –ø–∏—Ç–∞–Ω...
3,"–Æ—Ä–∏–π —Ö–æ—Ä–æ—à–∏–π –≤—ã —á–µ–ª–æ–≤–µ–∫, –¥–∞ –ø–æ–º–æ–∂–µ—Ç –≤–∞–º –ë–æ–≥ .‚ù§...","–í —ç—Ç–æ–º –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏ –æ–±—Å—É–∂–¥–∞–µ—Ç—Å—è, —á—Ç–æ —Ä–æ–ª–∏–∫–∏ –Æ—Ä–∏..."
4,–í–∞—É... –ø—Ä–µ–∫—Ä–∞—Å–Ω–æ ‚ù§ –ö–∞–∫ –≤—Å–µ–≥–¥–∞ –∏–¥–µ–∞–ª—å–Ω–æ‚ù§ –ê—Ç–º–æ—Å—Ñ...,–í –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏—è—Ö –∫ –ø–æ—Å—Ç—É –∞–≤—Ç–æ—Ä –≤—ã—Ä–∞–∂–∞–µ—Ç —Å–≤–æ—é —Ä–∞–¥...


In [7]:
repl = pd.read_excel('../src/replacements_long.xlsx', header=None)
repl.columns = ['r']
repl = list(repl.r.values)

In [8]:
def repls(x):

    if x[0]=='"' and x[-1]=='"':
        x = x[1:-1]

    very_pop_intro = '–ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏ –≤–∞—Ä—å–∏—Ä—É—é—Ç—Å—è –æ—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã—Ö –¥–æ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã—Ö. '
    if very_pop_intro in x[0:len(very_pop_intro)] and len(x)*2 > len(very_pop_intro):
        x = x.replace(very_pop_intro, '')

    for _ in range(10):
        for el in repl:
            if el in x[0:len(el)]:
                x = x.replace(el, '')
    return x.capitalize()


df['Y'] = df['Y'].apply(lambda x: repls(x))

In [9]:
df['X'] = df['X'].apply(lambda x: x[0:1000])
df = df[df['Y']!='']
df.shape[0]

9906

In [10]:
df_train, df_test = train_test_split(df.dropna(), test_size=0.15, random_state=1)
pairs = df_train[['X', 'Y']].values.tolist()

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

raw_model = 'd0rj/rut5-base-summ'
model = T5ForConditionalGeneration.from_pretrained(raw_model).cuda()
tokenizer = T5Tokenizer.from_pretrained(raw_model)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

# new_model_name = '../src/my_custom_model_4'
# model = T5ForConditionalGeneration.from_pretrained(new_model_name, local_files_only=True).cuda()
# # model = T5ForConditionalGeneration.from_pretrained(new_model_name, local_files_only=True)
# tokenizer = T5Tokenizer.from_pretrained(new_model_name, local_files_only=True)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
batch_size = 4 # 8
report_steps = 50
epochs = 5

# batch_size = 8
# report_steps = 5
# epochs = 1

In [23]:
# pairs = pairs[0:20]

In [24]:
model.train()
losses = []
best_test_loss = 9999999
breaker = 0
stop_breaker = 100
for epoch in range(epochs):
    print('EPOCH', epoch)
    random.shuffle(pairs)
    for i in trange(0, int(len(pairs) / batch_size)):
        model.train()
        batch = pairs[i * batch_size: (i + 1) * batch_size]
        x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
        y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
        y.input_ids[y.input_ids == 0] = -100
        loss = model(
            input_ids=x.input_ids,
            attention_mask=x.attention_mask,
            labels=y.input_ids,
            decoder_attention_mask=y.attention_mask,
            return_dict=True
        ).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

        if i % report_steps == 0:

            model.eval()
            losses_test = []
            pairs_test = df_test[['X', 'Y']].values.tolist()
            random.shuffle(pairs_test)
            for j in range(0, int(len(pairs_test) / batch_size)):
                batch = pairs_test[j * batch_size: (j + 1) * batch_size]
                x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                y.input_ids[y.input_ids == 0] = -100
                loss = model(
                    input_ids=x.input_ids,
                    attention_mask=x.attention_mask,
                    labels=y.input_ids,
                    decoder_attention_mask=y.attention_mask,
                    return_dict=True
                ).loss
                losses_test.append(loss.item())

            print('step', i, 'loss:', np.mean(losses[-report_steps:]), 'test_loss:', np.mean(losses_test))

            test_loss = np.mean(losses_test)
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                print('loss —Ç–µ—Å—Ç–∞ —Å–Ω–∏–∑–∏–ª—Å—è, —Å–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å')
                new_model_name = 'src/my_custom_model_4'
                model.save_pretrained(new_model_name)
                tokenizer.save_pretrained(new_model_name)
            else:
                breaker += 1

            if breaker == stop_breaker:
                raise Exception('–ó–∞–∫–∞–Ω—á–∏–≤–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ')

EPOCH 0


  0%|          | 0/2105 [00:00<?, ?it/s]

step 0 loss: 2.777674674987793 test_loss: 2.251300893703882
loss —Ç–µ—Å—Ç–∞ —Å–Ω–∏–∑–∏–ª—Å—è, —Å–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å
step 50 loss: 2.595400447845459 test_loss: 2.2488900426263116
loss —Ç–µ—Å—Ç–∞ —Å–Ω–∏–∑–∏–ª—Å—è, —Å–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å
step 100 loss: 2.551977863311768 test_loss: 2.2440938628266123
loss —Ç–µ—Å—Ç–∞ —Å–Ω–∏–∑–∏–ª—Å—è, —Å–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å
step 150 loss: 2.5316172289848327 test_loss: 2.250102803713549
step 200 loss: 2.4950809979438784 test_loss: 2.250645996425351
step 250 loss: 2.562086486816406 test_loss: 2.249828809676466
step 300 loss: 2.533814187049866 test_loss: 2.2497527541497324
step 350 loss: 2.5412562251091004 test_loss: 2.24868938222407


OutOfMemoryError: ignored

In [13]:
model.eval()

def answer(x, **kwargs):
    inputs = tokenizer(x, return_tensors='pt').to(model.device)
    with torch.no_grad():
        hypotheses = model.generate(**inputs, **kwargs)
    return tokenizer.decode(hypotheses[0], skip_special_tokens=True)

In [14]:
sample = df_train.sample(50, random_state=42)
for i, row in sample.iterrows():
    print(row.X)
    print('real:', row.Y)
    print('model: ', repls(answer(row.X)))
    print('---')

–Ø–¥–æ–≥–∞–¥—ã–≤–∞—é—Å—å –ò–Ω—Ç–µ—Ä–µ—Å–Ω–æ, –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æüòä
–ë—É–¥—É —Ä–∞–¥–∞ –ø–µ—Ä–µ—Å–µ—á—å—Å—è) –≠—Ç–æ —Ç–∞ —Ñ–µ–µ—á–∫–∞ –í–∏–Ω–∫—Å —á—Ç–æ –ª–∏? –î–∞ —É–∂–µ –≤—Å—ë –ø–æ–Ω—è—Ç–Ω–æ –ø–æ —ç—Ç–æ–º—É –∫–æ–ª–ª–∞–∂—É, –ø—Ö–ø—Ö –ù–æ —Å–ø–æ–π–ª–µ—Ä–∏—Ç—å –Ω–µ –±—É–¥—É, –Ω–∞ –≤—Å—è–∫–∏–π ü§´ –ê –ø–æ—á–µ–º—É –≤—Å–µ —Ä–µ—à–∏–ª–∏ —Å–ø–æ–π–ª–µ—Ä–Ω—É—Ç—å –∫–æ–ª–ª–∞–∂–∞–º–∏ –∏–∑ –ø–∏–Ω—Ç–µ—Ä–µ—Å—Ç–∞ üòêü§® –Ω–µ —É–∂-—Ç–æ —Ç–µ—Ç—è –∏–∑ –≥—Ä—É–ø–ø—ã —Ç–∞—Ç—Ç—É?) —ç—Ç–æ –ª–µ–¥–∏?????
real: –ö–æ–ª–ª–∞–∂–∏ –∏–∑ –ø–∏–Ω—Ç–µ—Ä–µ—Å—Ç–∞, –∫–æ—Ç–æ—Ä—ã–µ –º–æ–≥—É—Ç –±—ã—Ç—å —Å–ø–æ–π–ª–µ—Ä–æ–º –∫ –Ω–æ–≤–æ–º—É —Å–µ–∑–æ–Ω—É —Å–µ—Ä–∏–∞–ª–∞ "–≤–∏–Ω–∫—Å". –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–∏ –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞—é—Ç, —á—Ç–æ –æ–¥–∏–Ω –∏–∑ –∫–æ–ª–ª–∞–∂–µ–π –º–æ–∂–µ—Ç –±—ã—Ç—å —Å–≤—è–∑–∞–Ω —Å –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–º –∏–∑ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω–æ–≥–æ —Å–µ—Ä–∏–∞–ª–∞.
model:  –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–∏ –æ–±—Å—É–∂–¥–∞—é—Ç –∫–æ–ª–ª–∞–∂, –∫–æ—Ç–æ—Ä—ã–π –æ–Ω–∏ —Ö–æ—Ç–µ–ª–∏ –±—ã —Å–ø–æ–π–ª–µ—Ä–∏—Ç—å, –∏ –≤—ã—Å–∫–∞–

In [15]:
sample = df_test.sample(50, random_state=42)
for i, row in sample.iterrows():
    print(row.X)
    print('real:', row.Y)
    print('model: ', repls(answer(row.X)))
    print('---')

–ü–∞—Ä–Ω–∏, –ø–∏—à–∏—Ç–µ –º–Ω–µ –≤ –ª–∏—á–∫—É, –∏ —è –≤—ã—à–ª—é –≤–∞–º —Å–≤–æ–∏ —Ñ–æ—Ç–æ –∞–±—Å–æ–ª—é—Ç–Ω–æ –±–µ—Å–ø–ª–∞—Ç–Ω–æ. –ñ–¥—É —Ç–µ–±—è —Å–æ–ª–Ω—ã—à–∫–æ. –ó–Ω–∞–µ–º, –æ—á–µ–Ω—å —Ö–æ—á–µ—Ç—Å—è –≤—ã—Å–∫–∞–∑–∞—Ç—å—Å—è, –Ω–æ —Å–Ω–∞—á–∞–ª–∞ –ø—Ä–æ—á–∏—Ç–∞–π—Ç–µ <a href="https://telegra.ph/Pravila-kommentariev-i-chata-Super-03-07">–ø—Ä–∞–≤–∏–ª–∞ —á–∞—Ç–∞</a> Super üòâ
–ü—Ä–∞–≤–∏–ª–∞ –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤ –∏ —á–∞—Ç–∞ Super
–ó–∞–ø—Ä–µ—â–µ–Ω–æ: ‚Äî –õ—é–±—ã–µ –≤–∏–¥—ã —Ä–µ–∫–ª–∞–º—ã (–≤ —Ç–æ–º —á–∏—Å–ª–µ –≤ –æ–ø–∏—Å–∞–Ω–∏–∏ –∞–∫–∫–∞—É–Ω—Ç–∞) –∏ –ø–æ–ø—Ä–æ—à–∞–π–Ω–∏—á–µ—Å—Ç–≤–æ; ‚Äî –ü–æ—Ä–Ω–æ–≥—Ä–∞—Ñ–∏—è –∏ —Å—Å—ã–ª–∫–∏ –Ω–∞ –Ω–µ–µ; ‚Äî –ù–µ—Ü–µ–Ω–∑—É—Ä–Ω—ã–µ –≤—ã—Å–∫–∞–∑—ã–≤–∞–Ω–∏—è; ‚Äî –°—Å—ã–ª–∫–∏ –Ω–∞ —Å—Ç–æ—Ä–æ–Ω–Ω–∏–µ —Å–∞–π—Ç—ã –∏ —Ç–µ–ª–µ–≥—Ä–∞–º-–∫–∞–Ω–∞–ª—ã;¬† ‚Äî –õ—é–±—ã–µ –≤–∏–¥—ã —à–æ–∫-–∫–æ–Ω—Ç–µ–Ω—Ç–∞: —Ç—Ä—É–ø—ã, –Ω–∞—Å–∏–ª–∏–µ –∏ —Ç–¥; ‚Äî –û—Å–∫–æ—Ä–±–ª–µ–Ω–∏—è —É—á–∞—Å—Ç–Ω–∏–∫–æ–≤ —á–∞—Ç–∞ –∏ –ª—é–±—ã—Ö –ª–∏—Ü (–ø–æ —Ä–∞—Å–æ–≤–æ–º—É, –≥–µ–Ω–¥–µ—Ä–Ω–æ–º—É –ø—Ä–∏–

–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø—Ä–µ–¥–∏–∫—Ç–æ–≤ –ø–æ —Ç—Ä–µ–π–Ω—É –∏ —Ç–µ—Å—Ç—É –≤ —Ñ–∞–π–ª—ã, —á—Ç–æ–±—ã –ø–æ—Ç–æ–º –≤—ã–±—Ä–∞—Ç—å –≤–≤–æ–¥–Ω—ã–µ –∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏

In [None]:
answers = []
sample = df_test.copy()
for i, row in sample.iterrows():
    # print(repls(answer(row.X)))
    answers.append(repls(answer(row.X)))
    if i % 100==0:
      print(i)


3200
4200
100
1100
8800
2000
7100
5500
1000
3400
1300
2700
7300
1800
3500
5800
800


In [None]:
df[['Y']].to_excel('clean_df.xlsx', index=False)

In [None]:
# sample = df_train.sample(50, random_state=42)
# for i, row in sample.iterrows():
#     print(row.X)
#     print('real:', row.Y)
#     print('model: ', repls(repls(answer(row.X))))
#     print('---')

import math
N = df_train.shape[0]
BATCH_SIZE = 4
steps = math.ceil(N / BATCH_SIZE)
result = []
SEQ_LEN = 1500

for i in range(steps):
    if i != steps:
        batch = list(df_train[i*BATCH_SIZE:(i+1)*BATCH_SIZE]['X'].values)
    else:
        batch = list(df_train[i*BATCH_SIZE:]['X'].values)

    input_ids = tokenizer(batch, return_tensors='pt', truncation=True,
                          max_length=SEQ_LEN, padding=True).input_ids.to(device)
    outputs = model.generate(input_ids)
    for el in outputs:
        summary = repls(tokenizer.decode(el, skip_special_tokens=True))
        result.append(summary)

    if i % 500 == 0:
        print(i)


0
500
1000
1500
2000


KeyboardInterrupt: ignored

In [None]:
with open('result.txt', 'w') as fp:
    for item in result:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')


# with open('answers.txt', 'w') as fp:
#     for item in answers:
#         # write each item on a new line
#         fp.write("%s\n" % item)
#     print('Done')

Done


In [None]:
# new_model_name = 'my_custom_model'
# model.save_pretrained(new_model_name)
# tokenizer.save_pretrained(new_model_name)

In [None]:
breaker

30

In [None]:
100 - 13 - 46 - 41

0