In [None]:
# Seed value used when seeding the random number generators.
SEED: int = 34
# Default generation budget: used as the default `max_new_tokens` of
# `gpt_generate`, so it counts newly generated tokens rather than words.
MAX_LEN: int = 70


In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import codecs
from transformers import  GPTNeoForCausalLM, GPT2Tokenizer, set_seed, GPT2Config
import time
import torch
from rouge import Rouge 
from tqdm import trange
from bleu import list_bleu

# Seed the random number generators once, right after the imports, with the
# notebook-wide SEED.
# NOTE(review): transformers' set_seed presumably seeds PyTorch as well, which
# would make the explicit torch.manual_seed call redundant - confirm.
set_seed(SEED)
torch.manual_seed(SEED)


In [None]:
def for_emoji(message):
    """Turn literal escape sequences in a message into real characters (emoji included).

    The text is expected to contain escapes written out as plain characters,
    e.g. the twelve characters ``\\ud83d\\ude00`` standing for one emoji.

    Args:
        message: The raw message text (``str``).

    Returns:
        The message with literal ``\\n`` sequences removed, escape sequences
        decoded, and UTF-16 surrogate pairs merged into single code points.

    Raises:
        AttributeError: If ``message`` is not a string (e.g. a missing value).
        UnicodeDecodeError: If the text contains a malformed escape, such as a
            trailing lone backslash, or an unpaired surrogate.
    """
    # Remove literal two-character "\n" sequences first; otherwise the escape
    # decoding below would turn them into real line breaks.
    message = message.replace('\\n', '')
    # Decoding a str with 'unicode_escape' directly re-reads its UTF-8 bytes as
    # Latin-1, which garbles every non-ASCII character already present.
    # Encoding to Latin-1 with 'backslashreplace' first keeps Latin-1
    # characters as single bytes and rewrites everything else as \uXXXX /
    # \UXXXXXXXX escapes, so the decode step restores them unchanged.
    message = message.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    # \uD83D\uDE00-style escapes decode to two separate surrogates; a UTF-16
    # round trip joins each pair into the single character it represents.
    message = message.encode('utf-16', 'surrogatepass').decode('utf-16')
    return message


In [None]:
# Read only the `message` column from the workbook, then decode the escaped
# emoji in every message.
raw_messages = pd.read_excel("input.xlsx", usecols=['message'])
df = raw_messages.assign(message=raw_messages['message'].map(for_emoji))


In [None]:
# Split every message on whitespace once and derive the word-level features
# from that shared token list.
tokens = df['message'].map(lambda text: str(text).split())

# number of words in the message
df['word_count'] = tokens.map(len)
# number of distinct words in the message
df['unique_word_count'] = tokens.map(lambda words: len(set(words)))
# mean word length in the message (NaN when the message has no words)
df['mean_word_length'] = tokens.map(lambda words: np.mean([len(word) for word in words]))
# number of characters in the message
df['char_count'] = df['message'].map(lambda text: len(str(text)))


In [None]:
new_features = ['word_count', 'unique_word_count', 'mean_word_length', 'char_count']

# One stacked panel per feature.
fig, axes = plt.subplots(ncols=1, nrows=len(new_features), figsize=(20, 50))

for ax, feature in zip(axes, new_features):
    # Density histogram of the feature with a KDE curve drawn on top.
    sns.histplot(df[feature], label='messages', ax=ax, kde=True, stat="density")
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)
    ax.legend()

    ax.set_title('Распределение {0} в данных'.format(feature), fontsize=15)
    # Cut the x-axis at four times the mean so the long tail does not
    # squash the bulk of the distribution.
    ax.set_xlim(0, 4 * df[feature].mean())
    # Enable the grid on every panel; a single plt.grid() call after the loop
    # would only affect the current (last) axes.
    ax.grid(True)

plt.show()


In [None]:
# Plain Python list of the decoded messages, in DataFrame row order.
messages = df['message'].to_list()


In [None]:
# Checkpoint shared by the tokenizer and the model.
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# The tokenizer's end-of-sequence id is passed as the model's pad token id.
GPTNeo = GPTNeoForCausalLM.from_pretrained(
    MODEL_NAME,
    pad_token_id=tokenizer.eos_token_id,
)


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Move the model to the chosen device and report which one is in use.
GPTNeo = GPTNeo.to(device)
print(device)


In [None]:
NUM_OF_MESSAGES = 1000

def gpt_generate(max_new_tokens = MAX_LEN, do_sample = True, 
                 temperature = 1.0, top_k = 50, top_p = 1.0,
                 num_return_sequences = 1, num_beams = 1, no_repeat_ngram_size = 0,
                 early_stopping = False):
    """Generate one message per prompt window and score the results.

    For each index ``i`` among the first ``NUM_OF_MESSAGES`` entries of the
    module-level ``messages`` list, a prompt is built from ``messages[i:i + 5]``,
    a continuation is generated with ``GPTNeo``, and the continuation is
    compared against ``messages[i]`` with ROUGE and BLEU.

    Args:
        max_new_tokens, do_sample, temperature, top_k, top_p,
        num_return_sequences, num_beams, no_repeat_ngram_size, early_stopping:
            Forwarded unchanged to ``GPTNeo.generate``. Only the first returned
            sequence of each call is kept.

    Returns:
        A tuple ``(generated_messages, scores)``: the list of generated strings
        and the averaged ROUGE score dict with an extra ``'bleu'`` entry.

    Raises:
        ValueError: If ``messages`` is empty.
    """
    # Never run past the end of the corpus: hypotheses and references must
    # have the same length for the metrics below.
    num_prompts = min(NUM_OF_MESSAGES, len(messages))
    if num_prompts == 0:
        raise ValueError('messages is empty: nothing to generate or score')
    references = messages[:num_prompts]

    prompt_prefix = 'Generate messages similar to these: \n'
    # Loop-invariant: feed inputs to whatever device the model lives on.
    model_device = GPTNeo.device

    generated_messages = []
    for i in trange(num_prompts):
        input_sequence = prompt_prefix + '\n'.join(messages[i : i + 5])
        encoded = tokenizer(input_sequence, return_tensors="pt").to(model_device)
        prompt_length = encoded.input_ids.shape[-1]
        # Pass the attention mask explicitly: pad and EOS share one id, so the
        # model cannot infer the mask from the input ids.
        generated_ids = GPTNeo.generate(encoded.input_ids,
                                        attention_mask = encoded.attention_mask,
                                        max_new_tokens = max_new_tokens, do_sample = do_sample,
                                        temperature = temperature, top_k = top_k, top_p = top_p,
                                        num_return_sequences = num_return_sequences, num_beams = num_beams,
                                        no_repeat_ngram_size = no_repeat_ngram_size,
                                        early_stopping = early_stopping)
        # The returned ids start with the prompt. Dropping the prompt by token
        # position is exact, whereas replacing the prompt text in the decoded
        # string fails silently whenever decoding does not reproduce the
        # prompt character for character.
        continuation_ids = generated_ids[0][prompt_length:]
        output_sequence = tokenizer.decode(continuation_ids, skip_special_tokens = True)
        output_sequence = output_sequence.replace('\n', '')
        # Substitute a placeholder for an empty or whitespace-only continuation.
        if not output_sequence.strip():
            output_sequence = 'NONE'
            print('NONE SEQUENCE CREATED')

        generated_messages.append(output_sequence)

    rouge = Rouge()
    # Rouge.get_scores takes (hypotheses, references) in that order.
    scores = rouge.get_scores(generated_messages, references, avg=True)
    # NOTE(review): the bleu package README calls list_bleu([refs], hyps) with
    # the references wrapped in an outer list - confirm that a flat list of
    # references is handled as intended.
    scores['bleu'] = list_bleu(references, generated_messages)
    return generated_messages, scores

In [None]:
def _sampled_run(temperature, top_k, top_p):
    """Run `gpt_generate` with sampling enabled for one decoding setting."""
    return gpt_generate(temperature = temperature, top_k = top_k, top_p = top_p, do_sample = True)


# Runs 1-8: sampling over temperature {1.0, 0.8} x top_k {50, 40} x top_p {1.0, 0.8}.
output_1, scores_1 = _sampled_run(1.0, 50, 1.0)
output_2, scores_2 = _sampled_run(1.0, 50, 0.8)
output_3, scores_3 = _sampled_run(1.0, 40, 1.0)
output_4, scores_4 = _sampled_run(1.0, 40, 0.8)
output_5, scores_5 = _sampled_run(0.8, 50, 1.0)
output_6, scores_6 = _sampled_run(0.8, 50, 0.8)
output_7, scores_7 = _sampled_run(0.8, 40, 1.0)
output_8, scores_8 = _sampled_run(0.8, 40, 0.8)

# Run 9: no sampling; beam search with 5 beams and no repeated bigrams.
output_9, scores_9 = gpt_generate(
    temperature = 1.0,
    top_k = 50,
    top_p = 1.0,
    do_sample = False,
    num_beams = 5,
    no_repeat_ngram_size = 2,
)


In [None]:
# Score dicts of the nine runs, in run order.
scores = [scores_1, scores_2, scores_3, scores_4, scores_5, scores_6, scores_7, scores_8, scores_9]
# 1-based run numbers for the x-axis.
abscissa = list(range(1, len(scores) + 1))

# One line per ROUGE variant (F-score); the legend label is the metric key.
for metric in ('rouge-1', 'rouge-2', 'rouge-l'):
    plt.plot(abscissa, [run[metric]['f'] for run in scores], label = metric)

# BLEU is divided by 100 before being drawn next to the ROUGE F-scores.
bleu_scaled = [run['bleu']/100 for run in scores]
plt.plot(abscissa, bleu_scaled, label = 'bleu')
plt.legend()
plt.show()


In [None]:
def _combined_score(run_scores):
    """Mean of BLEU (divided by 100) and the three ROUGE F-scores of one run."""
    total = (run_scores['bleu']/100
             + run_scores['rouge-1']['f']
             + run_scores['rouge-2']['f']
             + run_scores['rouge-l']['f'])
    return total / 4


# Single summary curve: the combined score of every run.
plt.plot(abscissa, [_combined_score(run) for run in scores], label = 'average_score')
plt.legend()
plt.show()
