# Задание

Обучить модель ML-классификатор на данных train_10000.json в различных режимах:

- с обученными на этих данных эмбеддингами
- с предобученными эмбеддингами (https://rusvectores.org/ru/models/)
- с эмбеддингами от предобученной модели

Сделать submission с результатами. Также представить jupyter notebook, в котором результаты были получены

In [9]:
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import gensim
import numpy as np
import pandas as pd
import os
import csv
import re
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [6]:
os.environ["http_proxy"] = "http://proxy.ad.speechpro.com:3128"
os.environ["https_proxy"] = "http://proxy.ad.speechpro.com:3128"
os.environ["ftp_proxy"] = "http://proxy.ad.speechpro.com:3128"

In [22]:
def clean_text(text):
    s = text.lower()
    s = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", s)
    s = re.sub("\s+", " ", s)
    s = s.strip()
    return s

def read_json_data(filename):
    df = pd.read_json(filename)
    texts = []
    labels = []
    for col in df.columns:
        for title, text in zip(df[col]['titles'], df[col]['texts']):
            texts.append(clean_text(title + ' ' + text))
            labels.append(col)
    df = pd.DataFrame({"label": labels, "text": texts})
    return df

In [23]:
train_df = read_json_data("archive/train_10000.json")

In [24]:
test_df = pd.read_csv("archive/test.csv")
test_df['alltext'] = test_df['title'] + ' ' + test_df['text']

In [25]:
labels_to_id = {
            'Экономика': 0, 
            'Спорт': 1, 
            'Культура': 2, 
            'Наука и техника': 3,
            'Наукаитехника': 3,
        }
id_to_labels = {
            0: 'Экономика', 
            1: 'Спорт', 
            2: 'Культура', 
            3: 'Наука и техника'
        }

## Обученные на этих данных эмбеддинги (Word2Vec)

In [14]:
w2v_model = Word2Vec(sentences=[r.split() for r in train_df.text], vector_size=100, window=5, min_count=1, workers=4)

In [19]:
sims = w2v_model.wv.most_similar('girl', topn=10)
sims

[('woman', 0.9497174620628357),
 ('are', 0.9484232068061829),
 ('your', 0.944129467010498),
 ('got', 0.9431295394897461),
 ('little', 0.9427224397659302),
 ('way', 0.941563606262207),
 ('that', 0.9404558539390564),
 ('how', 0.9403867125511169),
 ('fun', 0.9397619962692261),
 ('luck', 0.9395718574523926)]

In [21]:
train_features = []
for r in train_df.text:
    vectors = []
    for w in r:
        if w in w2v_model.wv:
            v = w2v_model.wv[w]
            vectors.append(v)
    vectors = np.array(vectors)
    feature = np.average(vectors, axis=0)
    train_features.append(feature)
train_features = np.array(train_features)

In [48]:
test_features = []
for r in test_df.alltext:
    vectors = []
    for w in r:
        if w in w2v_model.wv:
            v = w2v_model.wv[w]
            vectors.append(v)
    vectors = np.array(vectors)
    feature = np.average(vectors, axis=0)
    test_features.append(feature)
test_features = np.array(test_features)

In [32]:
train_labels = [labels_to_id[l] for l in train_df.label]

In [33]:
test_labels = pd.read_table("ground_truth.txt", delimiter=',')
test_labels = [labels_to_id[l] for l in test_labels.Category]

In [32]:
svm_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_cls.fit(train_features, train_labels)

In [49]:
preds = svm_cls.predict(test_features)
accuracy_score(test_labels, preds)

0.5611510791366906

## Предобученные эмеддинги (FastText)

In [None]:
import zipfile

!wget --no-check-certificate 'http://vectors.nlpl.eu/repository/20/213.zip' -O 213.zip


In [53]:
extract_dir = 'fasttext'
with zipfile.ZipFile('213.zip') as zf:
    zf.extractall(extract_dir)

In [54]:
model_path = os.path.join(extract_dir,'model.model')

In [57]:
model_ft_pred = gensim.models.KeyedVectors.load(model_path)

In [67]:
train_features = []
for text in train_df.text:
    vectors = []           
    vectors.append(np.zeros(model_ft_pred.vector_size))

    for word in text.split():
        if word in model_ft_pred.key_to_index:
            vector = model_ft_pred[model_ft_pred.key_to_index[word]]
            vectors.append(vector)

    vectors = np.array(vectors)       
    feature = np.average(vectors, axis=0)
    train_features.append(feature)
    
train_features = np.array(train_features)

In [68]:
test_features = []
for text in test_df.alltext:
    vectors = []           
    vectors.append(np.zeros(model_ft_pred.vector_size))

    for word in text.split():
        if word in model_ft_pred.key_to_index:
            vector = model_ft_pred[model_ft_pred.key_to_index[word]]
            vectors.append(vector)

    vectors = np.array(vectors)       
    feature = np.average(vectors, axis=0)
    test_features.append(feature)
    
test_features = np.array(test_features)

In [69]:
svm_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_cls.fit(train_features, train_labels)

In [70]:
preds = svm_cls.predict(test_features)
accuracy_score(test_labels, preds)

0.6091127098321343

## Эмбеддинги от предобученной модели (BERT multilingual base model)

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
model = model.cuda()

In [26]:
train_indexed = [tokenizer.encode(r, return_tensors="pt", max_length=128, add_special_tokens=True, truncation=True) for r in train_df.text]

In [28]:
test_indexed = [tokenizer.encode(r, return_tensors="pt", max_length=128, add_special_tokens=True, truncation=True) for r in test_df.alltext]

In [30]:
train_features = []
for input_ids in tqdm(train_indexed):
    with torch.no_grad():
        emb = model(input_ids.cuda())[0]
        emb = emb.squeeze()[0]
        emb = emb.tolist()
        train_features.append(emb)

100%|█████████████████████████████████████| 36259/36259 [06:49<00:00, 88.50it/s]


In [31]:
test_features = []
for input_ids in tqdm(test_indexed):
    with torch.no_grad():
        emb = model(input_ids.cuda())[0]
        emb = emb.squeeze()[0]
        emb = emb.tolist()
        test_features.append(emb)

100%|█████████████████████████████████████████| 417/417 [00:04<00:00, 87.64it/s]


In [34]:
svm_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_cls.fit(train_features, train_labels)

In [35]:
preds = svm_cls.predict(test_features)
accuracy_score(test_labels, preds)

0.8968824940047961