In [1]:
!pip3 install stop_words
!pip3 install pymorphy2



In [2]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from collections import Counter
import random
import numpy as np
import torch
import glob
import torch.nn as nn
from functools import lru_cache
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from functools import lru_cache
from sklearn.datasets import fetch_20newsgroups

Самостоятельно обучить классификатор текстов на примере 20newsgroups
На примере 20 newsgroups попробовать разные параметры для сверток для классификации текстов

In [19]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [4]:
newsgroups_test = fetch_20newsgroups(subset='test')

text_train = newsgroups_train.data
text_test = newsgroups_test.data

In [5]:
counts = Counter()
for sequence in text_train:
    counts.update(sequence.split())

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 10:
        del counts[word]
print("num_words after:",len(counts.keys()))
    
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

num_words before: 280308
num_words after: 23722


In [6]:
class NewsgroupsDataset(torch.utils.data.Dataset):
    
    def __init__(self, txts, labels, w2index, used_length):
        self._txts = txts
        self._labels = labels
        self._length = used_length
        self._w2index = w2index
        
    def __len__(self):
        return len(self._txts)
    
    @lru_cache(50000)
    def encode_sentence(self, txt):
        encoded = np.zeros(self._length, dtype=int)
        enc1 = np.array([self._w2index.get(word, self._w2index["UNK"]) for word in txt.split()])
        length = min(self._length, len(enc1))
        encoded[:length] = enc1[:length]
        return encoded, length
    
    def __getitem__(self, index):
        encoded, length = self.encode_sentence(self._txts[index])
        return torch.from_numpy(encoded.astype(np.int32)), float(self._labels[index]), length

In [16]:
class Net(nn.Module):
    def __init__(self, vocab_size=20, embedding_dim = 32, out_channel = 32, num_classes = 20):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(out_channel, num_classes)
        
    def forward(self, x):        
        output = self.embedding(x)
        #                       B  F  L         
        output = output.permute(0, 2, 1)
        output = self.conv(output)
        output = self.relu(output)
        output = torch.max(output, axis=2).values
        output = self.linear(output)
#        print(output.shape)
        
        return output
    
    def predict(self, x):
        return F.softmax(self.forward(x))

In [17]:
y_train = newsgroups_train.target
y_test = newsgroups_test.target

train_dataset = NewsgroupsDataset(text_train, y_train, vocab2index, 16)
test_dataset = NewsgroupsDataset(text_test, y_test, vocab2index, 16)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=2,
                          shuffle=True,
                          num_workers=3)
test_loader = torch.utils.data.DataLoader(test_dataset,
                          batch_size=2,
                          shuffle=False,
                          num_workers=1)

  cpuset_checked))


In [18]:
model = Net(vocab_size=len(vocab2index))

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

print(model)
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

model.train()
for epoch in tqdm_notebook(range(10)):  
    model.train()
    for i, data in enumerate(train_loader, 0):
        inputs, labels, lengths = data[0], data[1], data[2]
        inputs = inputs.long()
        labels = labels.view(-1, 1)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()
        
    model.eval()
    loss_accumed = 0
    for X, y, lengths in test_loader:
        X = X.long()
        y = y.view(-1, 1)
        output = model(X)
        loss = criterion(output, torch.max(labels, 1)[1])
        loss_accumed += loss
    print("Epoch {} valid_loss {}".format(epoch, loss_accumed))

print('Training is finished!')

Net(
  (embedding): Embedding(23724, 32)
  (conv): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (linear): Linear(in_features=32, out_features=20, bias=True)
)
Parameters: 762932


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


Epoch 0 valid_loss 3.159044581479975e-06
Epoch 1 valid_loss 1.9073477233177982e-06
Epoch 2 valid_loss 2.9802316703353426e-07
Epoch 3 valid_loss 0.0
Epoch 4 valid_loss 0.0
Epoch 5 valid_loss 0.0
Epoch 6 valid_loss 0.0
Epoch 7 valid_loss 0.0
Epoch 8 valid_loss 0.0
Epoch 9 valid_loss 0.0
Training is finished!
