In [1]:
import torch

from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [3]:
# Hub checkpoint shared by the tokenizer and the language model.
RU_GPT_CHECKPOINT = "sberbank-ai/rugpt3small_based_on_gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(RU_GPT_CHECKPOINT)
# output_attentions=True is kept from the original setup; the attention
# weights are not read anywhere in the visible cells.
ru_gpt = GPT2LMHeadModel.from_pretrained(
    RU_GPT_CHECKPOINT,
    output_attentions=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Last expression of the cell: moves the model and displays its repr.
ru_gpt.to(device)

In [18]:
# Encode the prompt as a PyTorch tensor of token ids.
start = tokenizer.encode(
    "Мой стек: ",
    return_tensors="pt"
)

# do_sample=True is required for top_p to have any effect; without it
# generate() decodes greedily and ignores the nucleus-sampling threshold.
# early_stopping was dropped because it only applies to beam search and no
# num_beams is set here.
result = ru_gpt.generate(
    start.to(device),
    max_length=50,
    do_sample=True,
    top_p=0.8,
    no_repeat_ngram_size=2,
)

In [None]:
# Flatten the generated ids to a 1-D NumPy array on the CPU, then decode them.
generated_ids = result.cpu().flatten().numpy()
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(generated_text)

In [24]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a plain-text training file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded unchanged as ``TextDataset``'s ``block_size``.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded unchanged as the collator's ``mlm`` flag.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps, model = None):
    """Fine-tune a GPT-2 language model on a text file and save the result.

    The tokenizer and the initial weights are written to ``output_dir`` before
    training starts; the trained model is written by ``trainer.save_model()``
    when training finishes.

    Args:
        train_file_path: Text file passed to ``load_dataset``.
        model_name: Checkpoint name or path used when ``model`` is None.
        output_dir: Directory for the tokenizer, checkpoints and final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` (it was previously
            accepted but never used).
        model: Acts only as a flag. When it is not None, the tokenizer and
            weights are reloaded from the hard-coded results directory and
            the passed object itself is not trained.
            NOTE(review): confirm whether the caller's object should be
            trained instead.
    """
    if model is None:
        checkpoint = model_name
    else:
        # Resume from the previously saved fine-tuned checkpoint.
        checkpoint = '/content/drive/MyDrive/case2/results'

    # Load through the transformers classes directly so this function does not
    # depend on load_model/load_tokenizer, which are defined in a later cell.
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    model = GPT2LMHeadModel.from_pretrained(checkpoint)

    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Persist the starting point so output_dir is loadable before training ends.
    tokenizer.save_pretrained(output_dir)
    model.save_pretrained(output_dir)

    # `device` is the notebook-level variable set in the device-selection cell.
    model = model.to(device)
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [20]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used by the other cells of this notebook can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Not needed any more: the txt file already exists.
import json
import os
def _job_descriptions(resumes):
    """Collect the non-null job descriptions from a list of resume dicts."""
    found = []
    for resume in resumes:
        # A resume may lack 'experienceItem' or hold None there.
        for job in resume.get('experienceItem') or []:
            description = job.get('description')
            if description is not None:
                found.append(description)
    return found


def get_text(file_name=None):
    """Read the case JSON and return all description texts as a flat list.

    Args:
        file_name: Path of the JSON file. When omitted, the file
            'case_2_data_for_members.json' in the current working directory
            is used, as before. Pass an explicit path (for example the copy
            on the mounted Drive) when the notebook runs from elsewhere.

    Returns:
        A list of strings: every non-null job description from the failed and
        then the confirmed resumes of each vacancy (in file order), followed
        by the non-null vacancy descriptions.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    if file_name is None:
        file_name = os.path.join(os.getcwd(), 'case_2_data_for_members.json')

    # The data is Russian text, so do not rely on the platform default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        data = json.load(file)

    vacancies = []
    resumes = []
    for vac in data:
        description = vac['vacancy']['description']
        # Skip null descriptions so the result can always be str-joined.
        if description is not None:
            vacancies.append(description)

        for group in ('failed_resumes', 'confirmed_resumes'):
            resumes.extend(_job_descriptions(vac[group]))

    return resumes + vacancies
# Load every description, report how many there are, then glue them into one
# '.'-separated string.
dataset = get_text()
fragment_count = len(dataset)
print(fragment_count)
text = ".".join(dataset)



FileNotFoundError: [Errno 2] No such file or directory: '/content/case_2_data_for_members.json'

In [25]:
# --- Paths on the mounted Drive ---
output_dir = "/content/drive/MyDrive/case2/results"
train_file_path = "/content/drive/MyDrive/case2/data.txt"

# --- Base checkpoint ---
# Alternative checkpoint: 'sberbank-ai/rugpt3small_based_on_gpt2'
model_name = "gpt2"

# --- Trainer settings ---
overwrite_output_dir = False
per_device_train_batch_size, num_train_epochs, save_steps = 8, 3, 500

In [29]:
#для обучения обязателен торч 1 11 0
!pip install torch==1.11.0



In [31]:
!pip install accelerate>=0.21.0

In [33]:
!pip install accelerate -U



In [None]:
# Fine-tune from `model_name` (model=None selects the fresh-start branch of
# train) using the settings from the configuration cell.
train(
    model=None,
    model_name=model_name,
    train_file_path=train_file_path,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
)

In [23]:
def load_model(model_path):
    """Return a ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return a ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length,
                  model_path='/content/drive/MyDrive/case2/results',
                  model=None, tokenizer=None):
    """Sample a continuation of ``sequence`` and print it.

    Args:
        sequence: Prompt; it is formatted into a string before tokenizing.
        max_length: Forwarded to ``generate`` as ``max_length``.
        model_path: Directory to load the model and/or tokenizer from when
            they are not supplied. Defaults to the fine-tuned results folder.
        model: Optional already-loaded model, to avoid re-reading the
            checkpoint from disk on every call.
        tokenizer: Optional already-loaded tokenizer, for the same reason.

    Returns:
        None. The decoded text is printed.
    """
    if model is None:
        model = load_model(model_path)
    if tokenizer is None:
        tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which is passed on explicitly because pad_token_id is set to the
    # EOS id below.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'].to(model.device),
        attention_mask=encoded['attention_mask'].to(model.device),
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=10,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [13]:
# Prompt and maximum total length for the sampling demo.
max_len = 130
sequence = "Мой стек: python, c++"

generate_text(sequence, max_len)

Мой стек: python, c++, css (с бухгал функциональными пользователями, так же разработчиками, что программ отправки документации), п


In [43]:
# Fine-tuned checkpoint directory on the mounted Drive.
path = "/content/drive/MyDrive/case2/results"

model = load_model(path)
tokenizer = load_tokenizer(path)

# Weight of the model's `transformer.wte` embedding module.
word_embeddings = model.transformer.wte.weight

In [None]:
# Bare expression: the notebook displays the repr of the loaded model.
model

In [None]:
# Works well only for single words, not for whole sentences.
def get_sequence_embeddings(sentence):
    """Return a probability-weighted token embedding for ``sentence``.

    The model's logits at the last position are turned into a distribution
    with softmax, and that distribution is used to average the rows of the
    ``model.transformer.wte`` embedding matrix.

    Only the last position is used on purpose: averaging over all positions
    is possible, but the resulting similarities are then always very close
    to 1.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        sentence: Text passed to the tokenizer.

    Returns:
        A 1-D tensor: the weighted embedding of the last position.
    """
    tokens = tokenizer(sentence, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**tokens)

        # Slice before softmax/matmul: only the final position is returned,
        # so there is no need to project every position onto the embeddings.
        last_logits = outputs['logits'][-1, -1, :]
        probabilities = torch.nn.functional.softmax(last_logits, dim=-1)

        embedding_layer = model.transformer.wte
        last = torch.matmul(probabilities, embedding_layer.weight)

    return last

# Two prompts of different length; print the shape of each embedding.
sequence1 = "Это последовательность"
sequence2 = "Это последовательность уже побольше вот"

embeddings1 = get_sequence_embeddings(sequence1)
print(embeddings1.shape)

embeddings2 = get_sequence_embeddings(sequence2)
print(embeddings2.shape)

In [51]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def get_cosine_sim(first, second):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with ``get_sequence_embeddings``, converted to a
    single-row NumPy matrix and compared with scikit-learn's
    ``cosine_similarity``.

    Args:
        first: First text.
        second: Second text.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    rows = [
        get_sequence_embeddings(text).detach().numpy().reshape(1, -1)
        for text in (first, second)
    ]
    similarity_matrix = cosine_similarity(rows[0], rows[1])
    return similarity_matrix[0][0]



In [55]:
# Sample phrases for the similarity check.
s1 = 'Необходимо знание python, SQL, машинное обучение'
s2 = 'Необходимо знание python'
s3 = 'Работать с большими данными'
s4 = 'Работал с большими данными и разробатывал модели машинного обучения на python'
s5 = 'English sentence'
s6 = 'Да'
s7 = 'Python'
s8 = 'C++'
s9 = 'Язык'
s10 = 'Нос'

# Pairs to compare, printed one similarity per line in this order.
pairs_to_compare = [
    (s1, s2),
    (s3, s4),
    (s1, s4),
    (s7, s8),
    (s7, s9),
    (s7, s10),
]
for left_text, right_text in pairs_to_compare:
    print(get_cosine_sim(left_text, right_text))

0.91665053
0.84231114
0.96071494
0.8607019
0.5465056
0.10261044
