In [1]:
import re
import nltk.data
import numpy as np
import pandas as pd
import random
import json

from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

## `1. Сбор данных`
___

In [3]:
embedding_size = 300

In [4]:
all_id_texts = pd.read_csv('train_data.csv')
all_id_labels = pd.read_csv('train_solution.csv')

test_id_texts = pd.read_csv('test_data.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [5]:
all_id_labels['category'].value_counts()

0    1428
2    1217
1    1199
Name: category, dtype: int64

### `1.1 Crypto`
___

In [6]:
crypto = []
with open('crypto.json', errors="ignore") as f:
    crypto = json.load(f)

In [7]:
crypto = np.array(pd.DataFrame(crypto)['messages'])

In [8]:
crypto_texts = []
for i in range(len(crypto)):
    try:
        val = crypto[i]['text'][0]
        if not type(val) is dict:
            crypto_texts += [val]
    except:
        pass

In [9]:
np.random.seed(1)

crypto_texts = np.array(crypto_texts)
crypto_texts = crypto_texts[np.random.randint(0, len(crypto_texts), 4000)]

In [10]:
crypto_message = pd.DataFrame()
crypto_message['id'] = ids = [i + 1 for i in range(len(crypto_texts))]
crypto_message['message'] = crypto_texts

In [11]:
crypto_message

Unnamed: 0,id,message
0,1,Lithuania Calls ICOâ€™s as Securities as it Is...
1,2,XRP Whales Moves 672 Million tokens; Will This...
2,3,Ripple(XRP) Scores Big in Latest Hearing as Ju...
3,4,"This is Why Bitcoin, Bitcoin Cash & Crypto Mar..."
4,5,Quant Price Analysis- Rising QNT Price Trigger...
...,...,...
3995,3996,Are you fanatic about getting the latest updat...
3996,3997,"Ethereum (ETH) Tests Key Support Level, Is $17..."
3997,3998,Bitcoin [BTC] Price Analysis: On The Verge of ...
3998,3999,"Bitcoin [BTC] Jumps $7,000 Mark, Hereâ€™s Why ..."


In [12]:
crypto_category = pd.DataFrame()
crypto_category['id'] = ids = [i + 1 for i in range(len(crypto_texts))]
crypto_category['category'] = 1

In [13]:
crypto_category

Unnamed: 0,id,category
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
3995,3996,1
3996,3997,1
3997,3998,1
3998,3999,1


### `1.2 Beauty`
___

In [14]:
tweets = []
for line in open('news.json', 'r'):
    text = json.loads(line)
    if text['category'] == 'STYLE & BEAUTY':
        tweets.append(text['short_description'])

In [15]:
np.random.seed(1)

tweets = np.array(tweets)
tweets = tweets[np.random.randint(0, len(tweets), 4000)]

In [16]:
beauty_message = pd.DataFrame()
beauty_message['id'] = ids = [i + 4001 for i in range(len(tweets))]
beauty_message['message'] = tweets

In [17]:
beauty_message

Unnamed: 0,id,message
0,4001,Maggie Gyllenhaal Kelly was spotted in Easter ...
1,4002,Since taking over the design reins for the men...
2,4003,If you want sophisticated gowns with understat...
3,4004,Want more? Be sure to check out Stylelist on T...
4,4005,Miranda in the dress in April 2013: When the a...
...,...,...
3995,7996,“I’m always looking for new and innovative mat...
3996,7997,"The most stunning, yet scary manicures we've s..."
3997,7998,Back when MTV actually played music videos.
3998,7999,Because of the black cats... get it?


In [18]:
beauty_category = pd.DataFrame()
beauty_category['id'] = ids = [i + 4001 for i in range(len(tweets))]
beauty_category['category'] = 2

In [19]:
beauty_category

Unnamed: 0,id,category
0,4001,2
1,4002,2
2,4003,2
3,4004,2
4,4005,2
...,...,...
3995,7996,2
3996,7997,2
3997,7998,2
3998,7999,2


### `1.3 Study`
___

In [20]:
study = pd.read_csv('study.csv')

In [21]:
np.random.seed(1)

study = study.iloc[np.random.randint(0, len(study), 4000), :]

In [22]:
study_message = pd.DataFrame()
study_message['id'] = ids = [i + 8001 for i in range(len(study))]
study_message['message'] = study['message'].values

In [23]:
study_message

Unnamed: 0,id,message
0,8001,p-stable distributions
1,8002,I looked at the NLP
2,8003,Is the report published somewhere?
3,8004,"10 doesn't work for me)\n\nNo, I scored on it,..."
4,8005,and be offended that no one writes
...,...,...
3995,11996,Any questions?
3996,11997,tomorrow's momo joke for u?
3997,11998,"If Ildus moves to Italy, it will be ildus da-s..."
3998,11999,Opened the textbook and everything


In [24]:
study_category = pd.DataFrame()
study_category['id'] = ids = [i + 8001 for i in range(len(study))]
study_category['category'] = 0

In [25]:
study_category

Unnamed: 0,id,category
0,8001,0
1,8002,0
2,8003,0
3,8004,0
4,8005,0
...,...,...
3995,11996,0
3996,11997,0
3997,11998,0
3998,11999,0


### `1.4 Объединение всех таблиц`
___

In [26]:
all_id_texts = pd.concat([all_id_texts, beauty_message, crypto_message, study_message])

In [27]:
all_id_labels = pd.concat([all_id_labels, beauty_category, crypto_category, study_category])

In [28]:
all_id_labels['category'].value_counts()

0    5428
2    5217
1    5199
Name: category, dtype: int64

In [29]:
len(all_id_texts)

15844

In [30]:
all_id_texts.head()

Unnamed: 0,id,message
0,271828,Over $616 million in Bitcoin was electrocated ...
1,271829,Quiz: Thursday or friday?
2,271830,The Australian Revenue Authority will start co...
3,271831,Let's continue😉. I present to you my new review
4,271832,Here comes your future palette.


In [31]:
len(all_id_labels)

15844

In [32]:
all_id_labels.head()

Unnamed: 0,id,category
0,271828,1
1,271829,0
2,271830,1
3,271831,2
4,271832,2


___

In [33]:
len(test_id_texts)

5927

In [34]:
test_id_texts.head()

Unnamed: 0,id,message
0,275672,But a lot of people have a job fair tonight.
1,275673,"Also, I got only 4 answers on the google form ..."
2,275674,"Vladimir, when will we have seminar?"
3,275675,"A couple at 111, too?"
4,275676,"It's on Anti-buying. And again, Zara:"


In [35]:
len(sample_submission)

5927

In [36]:
sample_submission.head()

Unnamed: 0,id,category
0,275672,0
1,275673,2
2,275674,0
3,275675,1
4,275676,1


## `2. Разделение на обучающую и валидационную выборки`
___

In [37]:
train_id_labels, validation_id_labels = train_test_split(all_id_labels, test_size=0.2, random_state=1, stratify=all_id_labels['category'])

In [38]:
len(train_id_labels), len(validation_id_labels)

(12675, 3169)

In [39]:
train_id_labels = train_id_labels.sort_values(by=['id'])
validation_id_labels = validation_id_labels.sort_values(by=['id'])

In [40]:
mask = all_id_labels['id'].isin(train_id_labels['id'].values)

train_id_texts = all_id_texts[mask]
validation_id_texts = all_id_texts[~mask]

## `3. Токенизация`
___

In [41]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [55]:
def review_to_wordlist(review, remove_stopwords=False):
    # remove links
    review = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", review)
    # get the text from the page
    review_text = BeautifulSoup(review, "lxml").get_text()
    # leave only words
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # convert to lowercase and split into words using space character
    words = review_text.lower().split()
    if remove_stopwords: # remove stopwords
        stops = stopwords.words("english")
        words = [w for w in words if not w in stops]
    return(words)

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # break the review oto sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # apply the function to each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [56]:
train_sentences = []

print("Parsing sentences from training set...")
for review in tqdm(train_id_texts['message']):
    train_sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set...


100%|██████████| 12675/12675 [00:11<00:00, 1081.17it/s]


In [57]:
print(len(train_sentences))
print(train_sentences[:2])

24627
[['over', 'million', 'in', 'bitcoin', 'was', 'electrocated', 'in', 'september', 'with', 'wrapped', 'bitcoin', 'wbtc', 'data', 'from', 'coindesk'], ['the', 'emissions', 'increased', 'by', 'more', 'than', 'per', 'cent', 'compared', 'to', 'august', 'when', 'million', 'was', 'currentized']]


## `4. Построение Word2Vec-модели`
___

In [93]:
from gensim.models import word2vec

In [94]:
word2vec_model = \
    word2vec.Word2Vec(
        train_sentences,
        workers=4,
        vector_size=embedding_size,
        min_count=15,
        window=10,
        sample=1e-3,
        epochs=100
    )

### `4.1 Задание №2 для отчета`

**Задание:** Необходимо получить представление для фразы `"My future"` в построенным мною признаковом пространстве (эмбеддинги токенов), а затем найти ближайший пример из обучающей выборки.
___

In [125]:
sum_of_tokens = (word2vec_model.wv['my'] + word2vec_model.wv['future']) / 2
sum_of_tokens.shape

(300,)

In [126]:
word2vec_model.wv.most_similar(positive=sum_of_tokens, topn=10)

[('my', 0.6688421964645386),
 ('future', 0.5974202752113342),
 ('upside', 0.2832581400871277),
 ('myself', 0.2531781494617462),
 ('i', 0.2522510886192322),
 ('dogs', 0.2446313202381134),
 ('pain', 0.2443465143442154),
 ('afraid', 0.22892029583454132),
 ('anymore', 0.22891657054424286),
 ('anywhere', 0.22476759552955627)]

**Комментарий:**

Составив фразу `"My future"`, как усредненную сумму эмбеддингов слов `"my"` и `"future"`, ближайшими нетривиальными примерами из обучающей выборки оказались следующие 4 слова:

* `upside` (от англ. потенциал, преимущство) - *видимо я раскрою свой потенциал* 
* `dogs` (от англ. собачки) - *ура, у меня появится собачка, да не одна, сейчас никого нету и не хватает как раз(* 
* `pain` (от англ. боль) - *вот это меня пугает) надеюсь это предсказание ошибочно*
* `afraid` (от англ. страх) - *я уже натерпелся страха с пересдачами (я все пересдал слава богу), надеюсь это предсказание также ошибочно*

### `4.2 Тестирование построенных токенов`
___

In [127]:
word2vec_model.corpus_count, len(train_sentences)

(24627, 24627)

In [128]:
word2vec_model.corpus_total_words

340239

In [129]:
len(word2vec_model.wv), len(word2vec_model.wv.index_to_key)

(2507, 2507)

In [130]:
word2vec_model.wv.index_to_key[:5]

['the', 'to', 'of', 'and', 'a']

In [131]:
word2vec_model.wv.index_to_key[0], word2vec_model.wv.index_to_key[2262]

('the', 'ema')

In [132]:
word = 'bitcoin'

try:
    print(len(word2vec_model.wv[word]))
except:
    print('Doens\'t have this word')

300


In [133]:
word = 'hololens'

try:
    print(len(word2vec_model.wv[word]))
except:
    print('Doens\'t have this word')

Doens't have this word


In [134]:
word2vec_model.wv.most_similar(positive='bitcoin', topn=5)

[('soars', 0.3881227374076843),
 ('eth', 0.3777061700820923),
 ('cryptivate', 0.3640345335006714),
 ('crypto', 0.36187246441841125),
 ('losses', 0.3488355576992035)]

## `5. Формирование таблиц (train, validation, test) с эмбеддинговыми описаниями текстов`
___

In [None]:
def create_embeddings_by_texts(id_texts):
    all_emb = None
    for id in tqdm(id_texts['id']):
        mask = (id_texts['id'] == id)
        text = id_texts[mask]['message'].values[0]
        text_tokens = review_to_wordlist(text, tokenizer)
        text_emb = np.zeros(embedding_size)

        received_tokens_amount = 0
        for token in text_tokens:
            try: 
                text_emb += word2vec_model.wv[token]
                received_tokens_amount += 1
            except:
                pass

        if received_tokens_amount != 0:
            text_emb = text_emb / received_tokens_amount

        text_emb = np.concatenate(([id], text_emb))
        text_emb = text_emb.reshape(1, embedding_size + 1)

        if all_emb is None:
            all_emb = text_emb
        else:
            all_emb = np.concatenate((all_emb, text_emb))

        df = pd.DataFrame(all_emb, columns=['id'] + [i for i in range(embedding_size)])
        df['id'] = df['id'].astype(int)
    return df.copy()

In [None]:
train_emb = create_embeddings_by_texts(train_id_texts)

In [None]:
validation_emb = create_embeddings_by_texts(validation_id_texts)

In [None]:
test_emb = create_embeddings_by_texts(test_id_texts)

In [None]:
X_train = train_emb.iloc[:, 1:]
y_train = train_id_labels['category'].values

X_train.shape, y_train.shape

In [None]:
X_val = validation_emb.iloc[:, 1:]
y_val = validation_id_labels['category'].values

X_val.shape, y_val.shape

In [None]:
X_test = test_emb.iloc[:, 1:]

X_test.shape

## `6. Предсказание на CatBoost`
___

In [None]:
from catboost import CatBoostClassifier

In [None]:
model = CatBoostClassifier(verbose=True, random_seed=1)
model.fit(X_train, y_train)

In [None]:
pred_train = model.predict(X_train)
pred_val = model.predict(X_val)
pred_test = model.predict(X_test)

In [None]:
pred_train = pred_train.reshape(-1)
pred_val = pred_val.reshape(-1)
pred_test = pred_test.reshape(-1)

In [None]:
y_train.shape, pred_train.shape

In [None]:
y_val.shape, pred_val.shape

In [None]:
print('Train acc.:\t {:.4f}'.format((pred_train == y_train).mean()))
print('Val acc.:\t {:.4f}'.format((pred_val == y_val).mean()))
print()

In [None]:
model.save_model('cat_boost_model')

## `7. Предсказание на RNN`
___

In [None]:
from torch import nn
from matplotlib import pyplot as plt 
from IPython.display import clear_output

import torch
import torch.nn.functional as F

In [None]:
def plot_graphics(accs_train_epoch, accs_val_epoch, losses_train_epoch, losses_val_epoch=None):
    fig, axs = plt.subplots(1, 2, figsize=(13, 4))

    axs[0].plot(range(1, len(accs_train_epoch) + 1), accs_train_epoch, label='train', color='blue', alpha=0.9)
    axs[0].plot(range(1, len(accs_val_epoch) + 1), accs_val_epoch, label='val', color='green', alpha=0.9)
    axs[0].set_ylabel('accuracy', fontsize=12)

    axs[1].plot(range(1, len(losses_train_epoch) + 1), losses_train_epoch, label='train', color='blue', alpha=0.9)
    axs[1].set_ylabel('loss', fontsize=12)

    for ax in axs:
        ax.grid()
        ax.set_xlabel('epochs', fontsize=12)
        ax.legend()
        
    plt.show()

In [None]:
def text_tokens_to_embs(text_tokens):
    text_embs = []

    for token in text_tokens:
        try:
            text_embs += [word2vec_model.wv[token].tolist()]
        except:
            pass
        
    return text_embs

In [None]:
class RNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        ###------------------------------------------------------###
        self.fc_x = nn.Linear(input_size, hidden_size)
        self.fc_h = nn.Linear(hidden_size, hidden_size)
        ###------------------------------------------------------###

    def forward(self, input, hidden):
        return torch.tanh(self.fc_x(input) + self.fc_h(hidden))

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
def train(optimizer, text_embs, category_tensor, category):
    hidden = rnncell.initHidden()

    rnncell.zero_grad()
    classifier.zero_grad()

    ###---------------------------------------------###
    for i in range(len(text_embs)):
        hidden = rnncell.forward(torch.tensor(text_embs[i]), hidden)
        
    output = classifier(hidden)
    ###---------------------------------------------###

    loss = F.nll_loss(output, category_tensor)
    loss.backward()
    optimizer.step()
    
    pred_category = output.exp().argmax()
    pred_category = pred_category.numpy()
    
    acc = (pred_category == category)

    return loss.item(), acc

In [None]:
def validation(text_embs, category_tensor, category):
    hidden = rnncell.initHidden()

    ###---------------------------------------------###
    for i in range(len(text_embs)):
        hidden = rnncell.forward(torch.tensor(text_embs[i]), hidden)
        
    output = classifier(hidden)
    ###---------------------------------------------###
    
    pred_category = output.exp().argmax()
    pred_category = pred_category.numpy()

    return pred_category

___

In [None]:
set_random_seed(42)

n_emb = 300
n_hidden = 128 # BE HAPPY TO INCREASE
n_categories = 3

rnncell = RNNCell(n_emb, n_hidden)
classifier = nn.Sequential(nn.Linear(n_hidden, n_categories), nn.LogSoftmax(dim=1))
params = list(rnncell.parameters()) + list(classifier.parameters())

opt = torch.optim.Adam(params, lr=1e-3)

In [None]:
# rnncell = torch.load('rnn_model')

In [None]:
epochs = 20
n_texts = len(train_id_texts)
true_val = validation_id_labels['category'].values

accs_train_epoch, accs_val_epoch = [], []
losses_train_epoch, losses_val_epoch = [], []

In [None]:
best_acc_val = 0

for epoch in tqdm(range(epochs)):    
    ### RESET
    current_loss, current_acc = 0, 0
    
    ### TRAIN
    for id in train_id_texts['id']:
        text = train_id_texts[train_id_texts['id'] == id]['message'].values[0]

        category = train_id_labels[train_id_labels['id'] == id]['category'].values[0]
        category_tensor = torch.tensor([category])

        text_tokens = review_to_wordlist(text)
        text_embs = text_tokens_to_embs(text_tokens)

        loss, acc = train(opt, text_embs, category_tensor, category)

        current_loss += loss
        current_acc += acc
        
    ### TRAIN: CALCULATE METRICS
    accs_train_epoch += [current_acc / n_texts]
    losses_train_epoch += [current_loss / n_texts]
    
    ### RESET
    current_loss, current_acc = 0, 0
    
    ### VALIDATION
    pred_val = []
    for id in validation_id_texts['id']:
        text = validation_id_texts[validation_id_texts['id'] == id]['message'].values[0]

        category = validation_id_labels[validation_id_labels['id'] == id]['category'].values[0]
        category_tensor = torch.tensor([category])

        text_tokens = review_to_wordlist(text)
        text_embs = text_tokens_to_embs(text_tokens)

        pred_val += [validation(text_embs, category_tensor, category)]
        
    cur_acc_val = (pred_val == true_val).mean()
    accs_val_epoch += [cur_acc_val]
    
    if cur_acc_val > best_acc_val:
        best_acc_val = cur_acc_val
        torch.save(rnncell, 'rnn__big_epoch_{0}'.format(epoch))
    
    clear_output()
    plot_graphics(accs_train_epoch, accs_val_epoch, losses_train_epoch)

___

In [None]:
def prediction(text_embs):
    hidden = rnncell.initHidden()

    ###---------------------------------------------###
    for i in range(len(text_embs)):
        hidden = rnncell.forward(torch.tensor(text_embs[i]), hidden)
        
    output = classifier(hidden)
    ###---------------------------------------------###
    
    pred_category = output.exp().argmax()
    pred_category = pred_category.numpy()

    return pred_category

In [None]:
pred_test = []

for id in tqdm(test_id_texts['id']):
    text = test_id_texts[test_id_texts['id'] == id]['message'].values[0]

    text_tokens = review_to_wordlist(text)
    text_embs = text_tokens_to_embs(text_tokens)

    pred_test += [prediction(text_embs)]

## `8. Выгрузка сабмита`
___

In [None]:
len(pred_test)

In [None]:
sample_submission['category'] = pred_test

In [None]:
sample_submission

In [None]:
sample_submission.to_csv('results.csv', index=False)

___