Задание 1. Сравнить качество методов векторизации CountVectorizer и TF-IDF на примере задачи классификации текстов. Сделать выводы.

Датасет:

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

Темы: alt.atheism, misc.forsale, soc.religion.christian, talk.politics.mideast

Алгоритм МО: логистическая регрессия

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

categories = ['alt.atheism', 'misc.forsale', 'soc.religion.christian', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

model = LogisticRegression()

pipeline_count = make_pipeline(count_vectorizer, model)
pipeline_tfidf = make_pipeline(tfidf_vectorizer, model)

pipeline_count.fit(newsgroups_train.data, newsgroups_train.target)
pipeline_tfidf.fit(newsgroups_train.data, newsgroups_train.target)

predictions_count = pipeline_count.predict(newsgroups_test.data)
predictions_tfidf = pipeline_tfidf.predict(newsgroups_test.data)

report_count = classification_report(newsgroups_test.target, predictions_count, target_names=newsgroups_test.target_names)
report_tfidf = classification_report(newsgroups_test.target, predictions_tfidf, target_names=newsgroups_test.target_names)
print("Классификация с использованием CountVectorizer:\n", report_count)
print("Классификация с использованием TF-IDF:\n", report_tfidf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Классификация с использованием CountVectorizer:
                         precision    recall  f1-score   support

           alt.atheism       0.60      0.83      0.70       319
          misc.forsale       1.00      0.42      0.59       390
soc.religion.christian       0.73      0.94      0.83       398
 talk.politics.mideast       0.84      0.82      0.83       376

              accuracy                           0.75      1483
             macro avg       0.79      0.75      0.74      1483
          weighted avg       0.80      0.75      0.74      1483

Классификация с использованием TF-IDF:
                         precision    recall  f1-score   support

           alt.atheism       0.94      0.79      0.86       319
          misc.forsale       0.90      0.99      0.94       390
soc.religion.christian       0.86      0.95      0.91       398
 talk.politics.mideast       0.98      0.90      0.94       376

              accuracy                           0.92      1483
          

# Вывод

tf-idf показывает около идеальное качество классификации

Задание 2. На примере задачи классификации текстов определить насколько предобработка текста (стемминг, лемматизация, стоп-слова и т.д.) влияет на качество обучения модели. Сделать выводы.  

Датасет:

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

Темы: alt.atheism, misc.forsale, soc.religion.christian, talk.politics.mideast

Алгоритм МО: логистическая регрессия

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text, use_stemming=True, use_stopwords=True):
    text = re.sub("[^a-zA-Z]", " ", text.lower())
    
    words = word_tokenize(text)

    if use_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

    if use_stemming:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

categories = ['alt.atheism', 'misc.forsale', 'soc.religion.christian', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

texts_preprocessed = [preprocess_text(text) for text in newsgroups_train.data]

vectorizer_preprocessed = CountVectorizer()
X_train_preprocessed = vectorizer_preprocessed.fit_transform(texts_preprocessed)
y_train = newsgroups_train.target

model_preprocessed = LogisticRegression(max_iter=5000)
model_preprocessed.fit(X_train_preprocessed, y_train)

vectorizer_raw = CountVectorizer()
X_train_raw = vectorizer_raw.fit_transform(newsgroups_train.data)

model_raw = LogisticRegression(max_iter=5000)
model_raw.fit(X_train_raw, y_train)

X_test_preprocessed = vectorizer_preprocessed.transform([preprocess_text(text) for text in newsgroups_test.data])
X_test_raw = vectorizer_raw.transform(newsgroups_test.data)
y_test = newsgroups_test.target

report_preprocessed = classification_report(y_test, model_preprocessed.predict(X_test_preprocessed), target_names=newsgroups_test.target_names)
report_raw = classification_report(y_test, model_raw.predict(X_test_raw), target_names=newsgroups_test.target_names)

print("Отчет о классификации с предобработкой:\n", report_preprocessed)
print("Отчет о классификации без предобработки:\n", report_raw)


[nltk_data] Downloading package punkt to /home/qtr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/qtr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Отчет о классификации с предобработкой:
                         precision    recall  f1-score   support

           alt.atheism       0.90      0.82      0.86       319
          misc.forsale       0.94      0.99      0.97       390
soc.religion.christian       0.88      0.96      0.92       398
 talk.politics.mideast       0.98      0.90      0.94       376

              accuracy                           0.92      1483
             macro avg       0.93      0.92      0.92      1483
          weighted avg       0.93      0.92      0.92      1483

Отчет о классификации без предобработки:
                         precision    recall  f1-score   support

           alt.atheism       0.89      0.84      0.87       319
          misc.forsale       0.95      0.99      0.97       390
soc.religion.christian       0.89      0.97      0.93       398
 talk.politics.mideast       0.97      0.88      0.92       376

              accuracy                           0.93      1483
             mac

# Вывод

конечно, с текст предобработкой лучше, но для нашего случая особо сильной разницы нет

Задание 3. Сравнить качество обучения классических методов машинного обучения и методов глубокого обучения на примере задачи классификации текстов. Сделать выводы.

Датасет:

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

Темы: alt.atheism, misc.forsale, soc.religion.christian, talk.politics.mideast

In [7]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import os
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
categories = ['alt.atheism', 'misc.forsale', 'soc.religion.christian', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)


In [3]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [4]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
nb_pred = nb_classifier.predict(X_test)
print("Наивный Байесовский классификатор:\n", classification_report(y_test, nb_pred))

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_pred = lr_classifier.predict(X_test)
print("Логистическая регрессия:\n", classification_report(y_test, lr_pred))

linreg_classifier = LinearRegression()
linreg_classifier.fit(X_train, y_train)
linreg_pred = np.rint(linreg_classifier.predict(X_test)).astype(int)
print("Линейная регрессия:\n", classification_report(y_test, linreg_pred))

svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_pred = svm_classifier.predict(X_test)
print("SVM:\n", classification_report(y_test, svm_pred))

Наивный Байесовский классификатор:
               precision    recall  f1-score   support

           0       0.91      0.90      0.90       319
           1       0.98      0.98      0.98       390
           2       0.91      0.97      0.94       398
           3       0.98      0.92      0.95       376

    accuracy                           0.95      1483
   macro avg       0.95      0.94      0.94      1483
weighted avg       0.95      0.95      0.95      1483



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Логистическая регрессия:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86       319
           1       0.95      0.99      0.97       390
           2       0.88      0.97      0.92       398
           3       0.97      0.88      0.92       376

    accuracy                           0.92      1483
   macro avg       0.92      0.92      0.92      1483
weighted avg       0.92      0.92      0.92      1483



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Линейная регрессия:
               precision    recall  f1-score   support

          -8       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
          -1       0.00      0.00      0.00         0
           0       0.80      0.38      0.51       319
           1       0.56      0.81      0.66       390
           2       0.55      0.69      0.61       398
           3       0.70      0.34      0.46       376
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0

    accuracy                           0.57      1483
   macro avg       0.19      0.16      0.16      1483
weighted avg       0.65      0.57      0.57      1483

SVM:

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import os
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
categories = ['alt.atheism', 'misc.forsale', 'soc.religion.christian', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
vectorizer = CountVectorizer()

In [3]:
vectorizer = CountVectorizer() 
X_train = vectorizer.fit_transform(newsgroups_train.data).toarray()
X_test = vectorizer.transform(newsgroups_test.data).toarray()
y_train = newsgroups_train.target
y_test = newsgroups_test.target


X_train_tensor = torch.tensor(X_train).float()
X_test_tensor = torch.tensor(X_test).float()
y_train_tensor = torch.tensor(y_train).long()
y_test_tensor = torch.tensor(y_test).long()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=os.cpu_count())
test_loader = DataLoader(test_dataset, batch_size=128, num_workers=os.cpu_count())

In [4]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.rnn(x)
        if out.dim() >= 2:
            out = out.squeeze().unsqueeze(1).squeeze(1)
        #out = out[:, -1, :]
        out = self.fc(out)
        return out

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, (hn, cn) = self.lstm(x)
        if out.dim() >= 2:
            out = out.squeeze().unsqueeze(1).squeeze(1)
        #out = out[:, -1, :]
        out = self.fc(out)
        return out

class BiRNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiRNNModel, self).__init__()
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        out, _ = self.rnn(x)
        if out.dim() >= 2:
            out = out.squeeze().unsqueeze(1).squeeze(1)
        #out = out[:, -1, :]
        out = self.fc(out)
        return out

class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        out, (hn, cn) = self.lstm(x)
        if out.dim() >= 2:
            out = out.squeeze().unsqueeze(1).squeeze(1)
        #out = out[:, -1, :]
        out = self.fc(out)
        return out

In [5]:
def train_and_evaluate(model, train_loader, test_loader, device):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    model.train()
    for epoch in range(6): 
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model(texts)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

    model.eval()
    total = 0
    correct = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = 100 * correct / total
    print(f'Точность модели на тестовых данных: {accuracy}%')

    print("Classification Report:\n", classification_report(all_labels, all_predictions))

    return accuracy

In [6]:
input_dim = X_train.shape[1]  
hidden_dim = 128
output_dim = len(set(y_train))  

models = {
    "RNN": RNNModel(input_dim, hidden_dim, output_dim).to(device),
    "LSTM": LSTMModel(input_dim, hidden_dim, output_dim).to(device),
    "BiRNN": BiRNNModel(input_dim, hidden_dim, output_dim).to(device),
    "BiLSTM": BiLSTMModel(input_dim, hidden_dim, output_dim).to(device)
}


for model_name, model in models.items():
    print(f"Обучение и оценка {model_name} модели")
    accuracy = train_and_evaluate(model, train_loader, test_loader, device)
    print(f"{model_name} Точность: {accuracy}%\n")


Обучение и оценка RNN модели
Точность модели на тестовых данных: 93.39177343223196%
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.84      0.89       319
           1       0.97      0.99      0.98       390
           2       0.87      0.98      0.92       398
           3       0.97      0.91      0.94       376

    accuracy                           0.93      1483
   macro avg       0.94      0.93      0.93      1483
weighted avg       0.94      0.93      0.93      1483

RNN Точность: 93.39177343223196%

Обучение и оценка LSTM модели
Точность модели на тестовых данных: 93.66149696561025%
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       319
           1       0.96      0.99      0.98       390
           2       0.91      0.97      0.94       398
           3       0.97      0.90      0.93       376

    accuracy                           0.9

# Вывод

методы глубокого обучения показали себя значительно лучше, также использованние двухнаправленных rnn и lstm 

дало чуть лучшие реузльтаты по сравнению с классическими rnn и lstm

Задание 4. Используя модель Word2vec 

постройте эмбеддинги и визуализируйте их. Сделать выводы.

In [None]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import re
import os
import numpy as np
nltk.download('punkt')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def tokenize(text):
    return word_tokenize(text, language='russian')

with open('dostoevsky75.txt', 'r', encoding='utf-8') as file:
    dostoevsky_text = preprocess_text(file.read(10000))

with open('nietzsche.txt', 'r', encoding='utf-8') as file:
    nietzsche_text = preprocess_text(file.read(10000))

dostoevsky_tokens = tokenize(dostoevsky_text)
nietzsche_tokens = tokenize(nietzsche_text)

texts = [dostoevsky_tokens, nietzsche_tokens]

model = Word2Vec(texts, vector_size=100, window=5, min_count=1, workers=os.cpu_count())

def tsne_plot(model):
    labels = []
    tokens = []

    for word in model.wv.key_to_index:
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(tokens))

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(24, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

tsne_plot(model)

# вывод

русское слово "ясно" и его формы,  а также англоязычное слово "up" встречались чаще всего

Задание 5. Сравнить две модели трансформеров на примере машинного перевода. 

Перевод следующий: русский -> английский -> испанский -> арабский -> русский. Сделать выводы.

#удачи скачать это столько моделей(я про хельсинки)

In [13]:
from transformers import MarianMTModel, MarianTokenizer
import torch

def translate_helsinki(text, model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    translated = model.generate(**inputs)

    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated][0]

text = "Допустим, я тут что-то должен написать оригинальное. Я люблю хинкали и не только"

translated_helsinki = translate_helsinki(text, "Helsinki-NLP/opus-mt-ru-en")
translated_helsinki = translate_helsinki(translated_helsinki, "Helsinki-NLP/opus-mt-en-es")
translated_helsinki = translate_helsinki(translated_helsinki, "Helsinki-NLP/opus-mt-es-ar")
final_helsinki = translate_helsinki(translated_helsinki, "Helsinki-NLP/opus-mt-ar-ru")

print("Оригинальный текст:", text)
print("Перевод через Helsinki-NLP:", final_helsinki)


Оригинальный текст: Допустим, я тут что-то должен написать оригинальное. Я люблю хинкали и не только
Перевод через Helsinki-NLP: Скажем так, я должен написать что-то оригинальное.


In [12]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch

def translate(text, model_name, src_lang, tgt_lang, device):
    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

text = "Допустим, я тут что-то должен написать оригинальное. Я люблю хинкали и не только"
model_name_facebook = "facebook/m2m100_418M"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

translated_facebook = translate(text, model_name_facebook, "ru", "en", device)
translated_facebook = translate(translated_facebook, model_name_facebook, "en", "es", device)
translated_facebook = translate(translated_facebook, model_name_facebook, "es", "ar", device)
translated_facebook = translate(translated_facebook, model_name_facebook, "ar", "ru", device)

print("Оригинальный текст:", text)
print("Перевод с использованием Facebook:", translated_facebook)


Оригинальный текст: Допустим, я тут что-то должен написать оригинальное. Я люблю хинкали и не только
Перевод с использованием Facebook: Мы признаем, что мне нужно что-то написать здесь.Мне нравятся углы, а не только


Вывод: обе модели справились не очень, но шизофазия от фейсбука мне понравилась больше

Задание 6. Используя трансформеры сделать генерацию русскоязычного текста. 

Дообучить трансформер текстами Достоевского и повторить генерацию. Сделать выводы.

In [1]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model = GPT2LMHeadModel.from_pretrained("ai-forever/rugpt3small_based_on_gpt2").to(device)

text_to_generate = input("Введите строку: ")

input_ids = tokenizer.encode(text_to_generate, return_tensors='pt').to(device)

output = model.generate(input_ids, max_length=512, num_return_sequences=1, temperature=0.7, do_sample=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


2023-12-31 23:33:48.175238: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-31 23:33:48.175278: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-31 23:33:48.175316: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-31 23:33:48.182314: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Введите строку:  Я люблю хинкали и кавказкую кухню


Я люблю хинкали и кавказкую кухню.


21358947	sozero	2019-12-03 01:37:00	Пора уходить из страны, у которой уже нет денег 




















































































































































































































































































































































































































































































In [None]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model = GPT2LMHeadModel.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model.to(device)

file_path = "dostoevskysmall.txt"
file_path2 =file_path[:70000]

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path2,
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results', 
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=500,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

In [None]:
output_model_dir = "./fine_tuned_gpt2"  

import os
if not os.path.exists(output_model_dir):
    os.makedirs(output_model_dir)

model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print("Дообученная модель ru gpt3 small on GPT-2 сохранена в", output_model_dir)

In [None]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

text_to_generate = input("Введите строку: ")

input_ids = tokenizer.encode(text_to_generate, return_tensors='pt').to(device)

output = model.generate(input_ids, max_length=512, num_return_sequences=1, temperature=0.7, do_sample=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)