In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unit-3-nlp-txt-classification/sample_submission.csv
/kaggle/input/unit-3-nlp-txt-classification/train.csv
/kaggle/input/unit-3-nlp-txt-classification/test.csv


**Общее описание решения и вывод:**

1. При построении своей архитектуры использовал 2 вида RNN: Long Short-Term Memory (LSTM) и Gated Recurrent Units (GRU)

2. Библиотека: PyTorch

3. Рассматривал undirectional и bidirectional модели

4. Был выполнен предварительный анализ текстов, на основании его результатов выбраны методы предобработки, связанные с очисткой текста от спецсимволов, знаков препинания, смайлов и др. Полное перечисление шагов предобработки содержится в описании ячейки.

5. Анализ результатов влияния предобработки текста на результат показали, что лемматизация (независимо от количества лемматизируемых частей речи) ухудшает метрику, поэтому было принято решения отключить ее

6. Первоначально характеристики модели подбирались на одном наборе датасетов train, valid, test в соотношении 0.7-0.2-0.1 с фиксированным RandomSeed. Это позволяло прогонять за раз несколько вариантов моделей с разными гиперпараметрами и сравнивать метрики на test между собой. При этом обучение одной модели на GPU T4 x2 от kaggle в таких условиях занимало 2-3мин на эпоху для LSTM и 2-3мин на эпоху для GRU, что позволяло реализовывать перебор выделенными kaggle ресурсами.

7. При обучении моделей применялся lr_scheduler с линейным изменением шага, а также L2-регуляризация (через задание weight_decay > 0 в Adam-оптимизаторе. Согласно https://stackoverflow.com/questions/42704283/l1-l2-regularization-in-pytorch реализация weight_decay в PyTorch фактически соответствует L2-регуляризации) и dropout. dropout_rate был подобран в ходе экспериментов с моделями. В ходе разработки были опробованы варинты с weight_decay > 0 и weight_decay = 0. Получено, что задание weight_decay = 1e-5 улучшает valid-метрику на 2-3%.

8. Лучше всего на test датасете показали себя модели LSTM-bidirectional и GRU-undirectional.

9. Для генерации submit применялся ансамбль из двух моделей, показавших лучший результат на test датасете. Обе модели перед предсказанием submit обучались на полном датасете, без разделения на train, valid, test. Полученные на submit-датасете предсказания осреднялись. Обучение каждой из моделей на полном датасете заняло 3-4мин на эпоху.

10. Общие выводы: лучше всего показала себя LSTM-bidirectional модель. Лемматизация в случае RNN может ухудшать метрику, нельзя включать ее по умолчанию, без анализа влияния на результат. Наличие/отсутствие регуляризации может существенно повлиять на результат.

11. Идея на будущее: в идеале можно было бы попробовать нарастить датасет с помощью аугментации, например через замену слов синонимами и перестановку слов, но сходу не получилось поднять библиотеки в kaggle.

## 1. Работа с данными

In [54]:
work_dir = '/kaggle/input/unit-3-nlp-txt-classification'

In [55]:
# Загружаем train датасет, удаляя колонку с номером

train_df = pd.read_csv(f'{work_dir}/train.csv')
train_df = train_df.drop(['Unnamed: 0'], axis=1)
train_df

Unnamed: 0,Text,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41154,Airline pilots offering to stock supermarket s...,Neutral
41155,Response to complaint not provided citing COVI...,Extremely Negative
41156,You know itÂs getting tough when @KameronWild...,Positive
41157,Is it wrong that the smell of hand sanitizer i...,Neutral


In [56]:
# Проверяем на наличие nan

nan_count = train_df.isna().sum().sum()
nan_count

5

In [57]:
# Удаляем записи с nan с снова проверяем на наличие. nan больше нет

fixed_train_df = train_df.dropna()
nan_count = fixed_train_df.isna().sum().sum()
nan_count

0

In [58]:
# Смотрим распределение по классам.
# Видим, что тексты по классам распределены неравномерно, это нужно будет учесть при разбиении на train, val и test датасеты

fixed_train_df['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7711
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [59]:
# Задаем словарь для нумерации классов и заменяем названия на номера

class_to_idx = {'Extremely Negative': 0,
                 'Negative': 1,
                 'Neutral': 2,
                 'Positive': 3,
                 'Extremely Positive': 4
                }

def change_labels(input_label):
    return class_to_idx[input_label]

fixed_train_df['Sentiment'] = fixed_train_df['Sentiment'].map(change_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fixed_train_df['Sentiment'] = fixed_train_df['Sentiment'].map(change_labels)


In [60]:
fixed_train_df['Sentiment'].value_counts()

Sentiment
3    11422
1     9917
2     7711
4     6624
0     5481
Name: count, dtype: int64

In [61]:
# Проверяем тексты на наличие emojis

import emoji

def extract_emojis(input_text):    
    return [match["emoji"] for word in input_text for match in emoji.emoji_list(word)]

In [62]:
# Видим, что emojis, которые могли бы повлиять на смысловую окраску в текстах нет. Найденные эмоджи можно удалить

text_emojis = fixed_train_df['Text'].map(extract_emojis)
text_emojis.explode().value_counts().nlargest(25)

Text
©    65
®     5
Name: count, dtype: int64

In [63]:
# Словарь для преобразования emoticons из https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py

EMOTICONS_EMO = {
    u":‑)":"Happy face or smiley",
    u":-))":"Very Happy face or smiley",
    u":-)))":"Very very Happy face or smiley",
    u":)":"Happy face or smiley",
    u":))":"Very Happy face or smiley",
    u":)))":"Very very Happy face or smiley",
    u":-]":"Happy face or smiley",
    u":]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-)":"Happy face smiley",
    u":o)":"Happy face smiley",
    u":-}":"Happy face smiley",
    u":}":"Happy face smiley",
    u":-)":"Happy face smiley",
    u":c)":"Happy face smiley",
    u":^)":"Happy face smiley",
    u"=]":"Happy face smiley",
    u"=)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B^D":"Laughing, big grin or laugh with glasses",
    u":-))":"Very happy",
    u":-(":"Frown, sad, angry or pouting",
    u":‑(":"Frown, sad, angry or pouting",
    u":(":"Frown, sad, angry or pouting",
    u":‑c":"Frown, sad, angry or pouting",
    u":c":"Frown, sad, angry or pouting",
    u":‑<":"Frown, sad, angry or pouting",
    u":<":"Frown, sad, angry or pouting",
    u":‑[":"Frown, sad, angry or pouting",
    u":[":"Frown, sad, angry or pouting",
    u":-||":"Frown, sad, angry or pouting",
    u">:[":"Frown, sad, angry or pouting",
    u":{":"Frown, sad, angry or pouting",
    u":@":"Frown, sad, angry or pouting",
    u">:(":"Frown, sad, angry or pouting",
    u":'‑(":"Crying",
    u":'(":"Crying",
    u":'‑)":"Tears of happiness",
    u":')":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-*":"Kiss",
    u":*":"Kiss",
    u":X":"Kiss",
    u";‑)":"Wink or smirk",
    u";)":"Wink or smirk",
    u"*-)":"Wink or smirk",
    u"*)":"Wink or smirk",
    u";‑]":"Wink or smirk",
    u";]":"Wink or smirk",
    u";^)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑|":"Straight face",
    u":|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑)":"Angel, saint or innocent",
    u"O:)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑)":"Angel, saint or innocent",
    u"0:)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;^)":"Angel, saint or innocent",
    u">:‑)":"Evil or devilish",
    u">:)":"Evil or devilish",
    u"}:‑)":"Evil or devilish",
    u"}:)":"Evil or devilish",
    u"3:‑)":"Evil or devilish",
    u"3:)":"Evil or devilish",
    u">;)":"Evil or devilish",
    u"|;‑)":"Cool",
    u"|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑)":"Party all night",
    u"%‑)":"Drunk or confused",
    u"%)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑|":"Dump",
    u"(>_<)":"Troubled",
    u"(>_<)>":"Troubled",
    u"(';')":"Baby",
    u"(^^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(^_^;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(-_-;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(~_~;) (・.・;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(-_-)zzz":"Sleeping",
    u"(^_-)":"Wink",
    u"((+_+))":"Confused",
    u"(+o+)":"Confused",
    u"(o|o)":"Ultraman",
    u"^_^":"Joyful",
    u"(^_^)/":"Joyful",
    u"(^O^)／":"Joyful",
    u"(^o^)／":"Joyful",
    u"(__)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_(._.)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<(_ _)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m(__)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m(__)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m(_ _)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"('_')":"Sad or Crying",
    u"(/_;)":"Sad or Crying",
    u"(T_T) (;_;)":"Sad or Crying",
    u"(;_;":"Sad of Crying",
    u"(;_:)":"Sad or Crying",
    u"(;O;)":"Sad or Crying",
    u"(:_;)":"Sad or Crying",
    u"(ToT)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q.Q":"Sad or Crying",
    u"T.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"(-.-)":"Shame",
    u"(-_-)":"Shame",
    u"(一一)":"Shame",
    u"(；一_一)":"Shame",
    u"(=_=)":"Tired",
    u"(=^·^=)":"cat",
    u"(=^··^=)":"cat",
    u"=_^= ":"cat",
    u"(..)":"Looking down",
    u"(._.)":"Looking down",
    u"^m^":"Giggling with hand covering mouth",
    u"(・・?":"Confusion",
    u"(?_?)":"Confusion",
    u">^_^<":"Normal Laugh",
    u"<^!^>":"Normal Laugh",
    u"^/^":"Normal Laugh",
    u"（*^_^*）" :"Normal Laugh",
    u"(^<^) (^.^)":"Normal Laugh",
    u"(^^)":"Normal Laugh",
    u"(^.^)":"Normal Laugh",
    u"(^_^.)":"Normal Laugh",
    u"(^_^)":"Normal Laugh",
    u"(^^)":"Normal Laugh",
    u"(^J^)":"Normal Laugh",
    u"(*^.^*)":"Normal Laugh",
    u"(^—^）":"Normal Laugh",
    u"(#^.^#)":"Normal Laugh",
    u"（^—^）":"Waving",
    u"(;_;)/~~~":"Waving",
    u"(^.^)/~~~":"Waving",
    u"(-_-)/~~~ ($··)/~~~":"Waving",
    u"(T_T)/~~~":"Waving",
    u"(ToT)/~~~":"Waving",
    u"(*^0^*)":"Excited",
    u"(*_*)":"Amazed",
    u"(*_*;":"Amazed",
    u"(+_+) (@_@)":"Amazed",
    u"(*^^)v":"Laughing,Cheerful",
    u"(^_^)v":"Laughing,Cheerful",
    u"((d[-_-]b))":"Headphones,Listening to music",
    u'(-"-)':"Worried",
    u"(ーー;)":"Worried",
    u"(^0_0^)":"Eyeglasses",
    u"(＾ｖ＾)":"Happy",
    u"(＾ｕ＾)":"Happy",
    u"(^)o(^)":"Happy",
    u"(^O^)":"Happy",
    u"(^o^)":"Happy",
    u")^o^(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o.O":"Surpised",
    u"(o.o)":"Surprised",
    u"oO":"Surprised",
    u"(*￣m￣)":"Dissatisfied",
    u"(‘A`)":"Snubbed or Deflated"
}

In [64]:
# Проверяем текст на наличие emoticons

import re

def extract_emoticons(input_text):
    emoticon_pattern = re.compile(re.escape(u'(' + u'|'.join(k for k in EMOTICONS_EMO) + u')'))
    return re.findall(emoticon_pattern, input_text)

In [65]:
# Видим, что emoticons, которые представлены в словаре, в тексте нет

text_emoticons = fixed_train_df['Text'].map(extract_emoticons)
text_emoticons.explode().value_counts().nlargest(25)

Series([], Name: count, dtype: int64)

In [66]:
# Импортируем словарь для преобразования слэнговых сокращений

import requests

chat_words_map_dict = {}
chat_words_list = []

response = requests.get("https://raw.githubusercontent.com/rishabhverma17/sms_slang_translator/master/slang.txt")

for line in response.text.split("\n"):
    if line != "" and "=" in line:
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

In [67]:
# Загружаем библиотеки для предобработки текста

import nltk
import subprocess

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [68]:
# Пишем функции для предобработки текста.
# За основу брал функции из статьи https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
# Как было показано выше, смысловых emojis в тексте нет, поэтому удаляем их
# Поскольку emoticon вносят вклад в эмоциональную окраску текста, а наша цель - классифицировать тексты
#    именно по эмоциональной окраске, я не стал их удалять, а заменил на слова
#    сделать корректировку правописания не получилось т.к. spellchecker не поднялся нормально на kaggle, постоянно выдает ошибки при импорте

# Итого препроцессинг состоит из следующих последовательных этапов
#    1. Переводим слова с нижний регистр  
#    2. Удаляем знаки препинания
#    3. Удаляем стоп-слова
#    4. Лемматизация текста. При 
#    5. Удаляем emojis
#    6. Удаляем куски url
#    7. Удаляем html тэги
#    8. Преобразовываем слэнговые сокращения

from tqdm import tqdm

from bs4 import BeautifulSoup

import string

import re

stopwords = nltk.corpus.stopwords.words('english')

lemmatizer = WordNetLemmatizer()

# Попытки подобрать параметры лемматизации, улучшающие результат
# Включал и выключал отдельные части речи, однако лучший результат все равно получен без лемматизации
# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# wordnet_map = {"N":wordnet.NOUN}

def remove_punctuation(input_text):
    return " ".join([word for word in str(input_text).split() if word not in set(string.punctuation)])

def remove_stopwords(input_text):
    return " ".join([word for word in str(input_text).split() if word not in stopwords])

def lemmatize_words(input_text):
    pos_tagged_text = nltk.pos_tag(input_text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

def remove_emojis(input_text):
    return emoji.replace_emoji(input_text, replace='')

def remove_urls(input_text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', input_text)

def remove_html(input_text):
    return BeautifulSoup(input_text, "lxml").text

def chat_words_conversion(input_text):
    new_text = []
    for w in input_text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

def process_text(input_text):
    processed_text = input_text.lower() # Переводим слова с нижний регистр  
    processed_text = remove_punctuation(processed_text) # Удаляем знаки препинания
    processed_text = remove_stopwords(processed_text) # Удаляем 
    # processed_text = lemmatize_words(processed_text) # Лемматизация текста. Отключена т.к. ухудшает результат
    processed_text = remove_emojis(processed_text) # Удаляем emojis
    processed_text = remove_urls(processed_text) # Удаляем куски url
    processed_text = remove_html(processed_text) # Удаляем html тэги
    processed_text = chat_words_conversion(processed_text) # Преобразовываем слэнговые сокращения
    return processed_text

tqdm.pandas()

fixed_train_df['Processed_text'] = fixed_train_df['Text'].progress_map(process_text)
fixed_train_df

  return BeautifulSoup(input_text, "lxml").text
100%|██████████| 41155/41155 [00:30<00:00, 1371.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fixed_train_df['Processed_text'] = fixed_train_df['Text'].progress_map(process_text)


Unnamed: 0,Text,Sentiment,Processed_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2,@menyrbie @phil_gahan @chrisitv
1,advice Talk to your neighbours family to excha...,3,advice talk neighbours family exchange phone n...
2,Coronavirus Australia: Woolworths to give elde...,3,coronavirus australia: woolworths give elderly...
3,My food stock is not the only one which is emp...,3,"food stock one empty... please, panic, enough ..."
4,"Me, ready to go at supermarket during the #COV...",0,"me, ready go supermarket #covid19 outbreak. i'..."
...,...,...,...
41154,Airline pilots offering to stock supermarket s...,2,airline pilots offering stock supermarket shel...
41155,Response to complaint not provided citing COVI...,0,response complaint provided citing covid-19 re...
41156,You know itÂs getting tough when @KameronWild...,3,know itâs getting tough @kameronwilds rationi...
41157,Is it wrong that the smell of hand sanitizer i...,2,wrong smell hand sanitizer starting turn on? #...


In [69]:
# Разделяем на train, valid и test выборки
# train используем для обучения модели, valid для валидации
# test модель не видит в процессе обучения, ее используем для сравнения моделей
# преобразуем в список полный датасет чтобы обучить на нем наилучшую модель для получения submit

from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

TEST_SPLIT_SIZE = 0.1
VALID_SPLIT_SIZE = 0.2

X_rem, X_test, y_rem, y_test = train_test_split(fixed_train_df['Processed_text'].tolist(),
                                                fixed_train_df['Sentiment'].tolist(),
                                                test_size=TEST_SPLIT_SIZE,
                                                shuffle= True,
                                                stratify=fixed_train_df['Sentiment'].tolist(),
                                                random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_rem,
                                                  y_rem,
                                                  test_size=VALID_SPLIT_SIZE/(1.0-TEST_SPLIT_SIZE),
                                                  shuffle= True,
                                                  stratify=y_rem,
                                                  random_state=RANDOM_SEED)

train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))
valid_data = list(zip(X_val, y_val))

complete_data = list(zip(fixed_train_df['Processed_text'].tolist(), fixed_train_df['Sentiment'].tolist()))

In [70]:
# Проверяем, что разделение верное

print(f'fixed_train_df.shape: {fixed_train_df.shape[0]}')
print(f'train_data size: {len(train_data)}')
print(f'test_data size: {len(test_data)}')
print(f'valid_data size: {len(valid_data)}')
print(f'summ: {len(train_data)+len(test_data)+len(valid_data)}')
print(f'complete_data size: {len(complete_data)}')

fixed_train_df.shape: 41155
train_data size: 28808
test_data size: 4116
valid_data size: 8231
summ: 41155
complete_data size: 41155


## 2. Векторизация текстов

In [71]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from torchtext.vocab import build_vocab_from_iterator

import spacy

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = get_tokenizer("basic_english")

In [72]:
# Пишем функцию для токенизации текста и составляем словарь

def get_tokens(datasets):
    for dataset in datasets:
        for text, sentiment in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(
    get_tokens([train_data, test_data, valid_data]),
    min_freq=1,
    specials=['<unk>'],
    special_first=True
)

vocab.set_default_index(vocab['<unk>'])

In [73]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [74]:
# Пишем функцию для векторизации батча.
# Векторы текстов приводим к одной длине, заполняя недостающие слова как '<unk>' с помощью pad_sequence

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        processed_text = text_pipeline(_text)
        text_list.append(processed_text)
    text_list = pad_sequence([torch.tensor(p) for p in text_list], batch_first=True)
    label_list = torch.tensor(label_list, dtype=torch.int32)
    return text_list.to(device), label_list.to(device)

In [75]:
# Создаем итераторы-загрузочники для каждого из датасетов

BATCH_SIZE = 64

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=collate_batch, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=collate_batch, shuffle=False)
test_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=collate_batch, shuffle=False)

In [76]:
# Проверяем работу итераторов

print('Train')
for batch in train_dataloader:
    print(f'Text matrix size: {batch[0].size()}')
    print(f'Target vector size: {batch[1].size()}')
    break
    
print('\nValid:')
for batch in valid_dataloader:
    print(f'Text matrix size: {batch[0].size()}')
    print(f'Target vector size: {batch[1].size()}')
    break
    
print('\nTest:')
for batch in test_dataloader:
    print(f'Text matrix size: {batch[0].size()}')
    print(f'Target vector size: {batch[1].size()}')
    break

Train
Text matrix size: torch.Size([64, 47])
Target vector size: torch.Size([64])

Valid:
Text matrix size: torch.Size([64, 45])
Target vector size: torch.Size([64])

Test:
Text matrix size: torch.Size([64, 45])
Target vector size: torch.Size([64])


## 3. Разработка и анализ моделей

In [77]:
# Задаем архитектуру LSTM модели

import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout_rate):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout_rate,
                            batch_first=True)
        
        self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, X_batch, length):
        embeddings = self.dropout(self.embedding(X_batch))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embeddings,
                                                            length.to('cpu'),
                                                            batch_first=True, 
                                                            enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
       
        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)
        
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat([hidden[-1,:,:], hidden[-2,:,:]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
            
        prediction = self.linear(hidden)
        
        return prediction

In [78]:
# Задаем гиперпараметры LSTM модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(class_to_idx)
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.4

lstm_model = LSTMClassifier(INPUT_DIM,
                            EMBEDDING_DIM,
                            HIDDEN_DIM,
                            OUTPUT_DIM,
                            N_LAYERS,
                            BIDIRECTIONAL,
                            DROPOUT)
lstm_model = lstm_model.to(device)

In [79]:
# Задаем гиперпараметры LSTM-Bidirectional модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(class_to_idx)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.4

lstm_bi_model = LSTMClassifier(INPUT_DIM,
                               EMBEDDING_DIM,
                               HIDDEN_DIM,
                               OUTPUT_DIM,
                               N_LAYERS,
                               BIDIRECTIONAL,
                               DROPOUT)
lstm_bi_model = lstm_bi_model.to(device)

In [80]:
# Задаем архитектуру GRU модели

from torch import nn
from torch.nn import functional as F

class GRUClassifier(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout_rate):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim,
                          hidden_size=hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout_rate,
                          batch_first=True)
        self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        # self.linear = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

    def forward(self, X_batch, length):
        embeddings = self.dropout(self.embedding(X_batch))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embeddings,
                                                            length.to('cpu'),
                                                            batch_first=True, 
                                                            enforce_sorted=False)
        
        packed_output, hidden = self.gru(packed_embedded)
       
        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)
        
        #if self.gru.bidirectional:
         #   last_tensor = self.dropout(torch.cat([output[-1,:,:], output[-2,:,:]], dim=-1))
        #else:
        last_tensor = self.dropout(output[-1,:,:])
            
        prediction = self.linear(last_tensor)
        
        return prediction

In [81]:
# Задаем гиперпараметры GRU модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.4

gru_model = GRUClassifier(INPUT_DIM,
                          EMBEDDING_DIM,
                          HIDDEN_DIM,
                          OUTPUT_DIM,
                          N_LAYERS,
                          BIDIRECTIONAL,
                          DROPOUT)
gru_model = gru_model.to(device)

In [82]:
# Задаем гиперпараметры GRU-Bidirectional модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.4

gru_bi_model = GRUClassifier(INPUT_DIM,
                             EMBEDDING_DIM,
                             HIDDEN_DIM,
                             OUTPUT_DIM,
                             N_LAYERS,
                             BIDIRECTIONAL,
                             DROPOUT)
gru_bi_model = gru_bi_model.to(device)

In [83]:
# Пишем функцию для обучения модели

def train(model, iterator, optimizer, scheduler, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        labels = batch[1].type(torch.LongTensor).to(device)
        
        text, text_lengths = batch[0].to(device), torch.tensor([len(batch[0][0])] * len(batch[0])).to(device)
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, labels)      
        
        optimizer.zero_grad()
               
        loss.backward()
        
        optimizer.step()
        
        scheduler.step()
        
        y_pred_class = torch.argmax(torch.softmax(predictions, dim=1), dim=1)

        epoch_loss += loss.item()
        epoch_acc += (y_pred_class==labels).sum().item() / len(predictions)
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [84]:
# Пишем функцию для валидации модели

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            
            labels = batch[1].type(torch.LongTensor).to(device)

            text, text_lengths = batch[0].to(device), torch.tensor([len(batch[0][0])] * len(batch[0])).to(device)
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, labels)
            
            y_pred_class = torch.argmax(torch.softmax(predictions, dim=1), dim=1)

            epoch_loss += loss.item()
            epoch_acc += (y_pred_class==labels).sum().item() / len(predictions)
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [85]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train_model(train_dataloader,
                model,
                model_name,
                optimizer,
                scheduler,
                criterion,
                num_epochs
               ):
    best_valid_loss = float('inf')
    
    for epoch in range(num_epochs):
        start_time = time.time()
        
        train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler, criterion)
        valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
        
        end_time = time.time()
        
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f"best-model_{model_name}.pt")
            
        print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
        print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
        print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

In [86]:
N_EPOCHS = 20

In [87]:
# Задаем оптимизатор, шедулер, устанавливаем weight_decay > 0 для L2-регуляризации в Adam

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

learning_rate = 1e-3
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate, weight_decay=1e-5)
lstm_scheduler = lr_scheduler.LinearLR(lstm_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [88]:
# Задаем функцию потерь для LSTM модели, переносим на device, обучаем модель

lstm_criterion = nn.CrossEntropyLoss()
lstm_criterion = lstm_criterion.to(device)

train_model(train_dataloader,
            lstm_model,
            'lstm',
            lstm_optimizer,
            lstm_scheduler,
            lstm_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 1.565 | Train Acc: 27.85%
	 Val. Loss: 1.531 |  Val. Acc: 31.01%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 1.514 | Train Acc: 31.16%
	 Val. Loss: 1.504 |  Val. Acc: 32.29%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 1.483 | Train Acc: 32.54%
	 Val. Loss: 1.454 |  Val. Acc: 33.87%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 1.435 | Train Acc: 35.30%
	 Val. Loss: 1.410 |  Val. Acc: 37.16%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: 1.389 | Train Acc: 38.26%
	 Val. Loss: 1.354 |  Val. Acc: 40.49%
Epoch: 06 | Epoch Time: 0m 8s
	Train Loss: 1.293 | Train Acc: 43.80%
	 Val. Loss: 1.222 |  Val. Acc: 48.80%
Epoch: 07 | Epoch Time: 0m 8s
	Train Loss: 1.189 | Train Acc: 49.87%
	 Val. Loss: 1.122 |  Val. Acc: 54.06%
Epoch: 08 | Epoch Time: 0m 8s
	Train Loss: 1.103 | Train Acc: 54.27%
	 Val. Loss: 1.061 |  Val. Acc: 56.97%
Epoch: 09 | Epoch Time: 0m 8s
	Train Loss: 1.028 | Train Acc: 57.91%
	 Val. Loss: 0.994 |  Val. Acc: 61.08%
Epoch: 10 | Epoch Time: 0m 7

In [89]:
# Задаем оптимизатор, шедулер, устанавливаем weight_decay > 0 для L2-регуляризации в Adam

learning_rate = 1e-3
lstm_bi_optimizer = optim.Adam(lstm_bi_model.parameters(), lr=learning_rate, weight_decay=1e-5)
lstm_bi_scheduler = lr_scheduler.LinearLR(lstm_bi_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [90]:
# Задаем функцию потерь для LSTM-Bidirectional модели, переносим на device, обучаем модель

lstm_bi_criterion = nn.CrossEntropyLoss()
lstm_bi_criterion = lstm_bi_criterion.to(device)

train_model(train_dataloader,
            lstm_bi_model,
            'lstm_bi',
            lstm_bi_optimizer,
            lstm_bi_scheduler,
            lstm_bi_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 1.478 | Train Acc: 33.47%
	 Val. Loss: 1.372 |  Val. Acc: 39.70%
Epoch: 02 | Epoch Time: 0m 12s
	Train Loss: 1.345 | Train Acc: 40.98%
	 Val. Loss: 1.268 |  Val. Acc: 46.11%
Epoch: 03 | Epoch Time: 0m 13s
	Train Loss: 1.248 | Train Acc: 46.60%
	 Val. Loss: 1.225 |  Val. Acc: 48.59%
Epoch: 04 | Epoch Time: 0m 13s
	Train Loss: 1.163 | Train Acc: 50.83%
	 Val. Loss: 1.165 |  Val. Acc: 50.96%
Epoch: 05 | Epoch Time: 0m 13s
	Train Loss: 1.093 | Train Acc: 54.44%
	 Val. Loss: 1.119 |  Val. Acc: 53.53%
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 1.020 | Train Acc: 58.25%
	 Val. Loss: 1.082 |  Val. Acc: 57.48%
Epoch: 07 | Epoch Time: 0m 13s
	Train Loss: 0.963 | Train Acc: 60.76%
	 Val. Loss: 1.082 |  Val. Acc: 59.21%
Epoch: 08 | Epoch Time: 0m 13s
	Train Loss: 0.913 | Train Acc: 63.64%
	 Val. Loss: 0.988 |  Val. Acc: 62.38%
Epoch: 09 | Epoch Time: 0m 13s
	Train Loss: 0.859 | Train Acc: 66.00%
	 Val. Loss: 0.890 |  Val. Acc: 66.08%
Epoch: 10 | Epoch T

In [91]:
# Задаем оптимизатор, шедулер, устанавливаем weight_decay > 0 для L2-регуляризации в Adam

learning_rate = 1e-3
gru_optimizer = optim.Adam(gru_model.parameters(), lr=learning_rate, weight_decay=1e-5)
gru_scheduler = lr_scheduler.LinearLR(gru_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [92]:
# Задаем функцию потерь для GRU модели, переносим на device, обучаем модель

gru_criterion = nn.CrossEntropyLoss()
gru_criterion = gru_criterion.to(device)

train_model(train_dataloader,
            gru_model,
            'gru',
            gru_optimizer,
            gru_scheduler,
            gru_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 1.567 | Train Acc: 27.29%
	 Val. Loss: 1.559 |  Val. Acc: 29.20%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 1.509 | Train Acc: 30.66%
	 Val. Loss: 1.468 |  Val. Acc: 33.96%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 1.394 | Train Acc: 38.00%
	 Val. Loss: 1.279 |  Val. Acc: 46.01%
Epoch: 04 | Epoch Time: 0m 7s
	Train Loss: 1.247 | Train Acc: 46.92%
	 Val. Loss: 1.168 |  Val. Acc: 52.04%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 1.147 | Train Acc: 52.04%
	 Val. Loss: 1.078 |  Val. Acc: 56.44%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 1.048 | Train Acc: 57.16%
	 Val. Loss: 1.035 |  Val. Acc: 58.23%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 0.970 | Train Acc: 60.88%
	 Val. Loss: 0.946 |  Val. Acc: 63.39%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 0.895 | Train Acc: 64.79%
	 Val. Loss: 0.923 |  Val. Acc: 65.17%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.827 | Train Acc: 67.74%
	 Val. Loss: 0.864 |  Val. Acc: 68.04%
Epoch: 10 | Epoch Time: 0m 7

In [93]:
# Задаем оптимизатор, шедулер, устанавливаем weight_decay > 0 для L2-регуляризации в Adam

learning_rate = 1e-3
gru_bi_optimizer = optim.Adam(gru_bi_model.parameters(), lr=learning_rate, weight_decay=1e-5)
gru_bi_scheduler = lr_scheduler.LinearLR(gru_bi_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [94]:
# Задаем функцию потерь для GRU-Bidirectional модели, переносим на device, обучаем модель

gru_bi_criterion = nn.CrossEntropyLoss()
gru_bi_criterion = gru_bi_criterion.to(device)

train_model(train_dataloader,
            gru_bi_model,
            'gru_bi',
            gru_bi_optimizer,
            gru_bi_scheduler,
            gru_bi_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 1.569 | Train Acc: 27.33%
	 Val. Loss: 1.520 |  Val. Acc: 31.34%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 1.506 | Train Acc: 31.31%
	 Val. Loss: 1.447 |  Val. Acc: 35.27%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 1.364 | Train Acc: 39.65%
	 Val. Loss: 1.263 |  Val. Acc: 45.11%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 1.237 | Train Acc: 46.77%
	 Val. Loss: 1.171 |  Val. Acc: 50.23%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 1.129 | Train Acc: 53.08%
	 Val. Loss: 1.055 |  Val. Acc: 57.72%
Epoch: 06 | Epoch Time: 0m 10s
	Train Loss: 1.035 | Train Acc: 57.42%
	 Val. Loss: 1.001 |  Val. Acc: 60.32%
Epoch: 07 | Epoch Time: 0m 10s
	Train Loss: 0.954 | Train Acc: 61.38%
	 Val. Loss: 0.932 |  Val. Acc: 63.34%
Epoch: 08 | Epoch Time: 0m 11s
	Train Loss: 0.885 | Train Acc: 64.82%
	 Val. Loss: 0.910 |  Val. Acc: 65.00%
Epoch: 09 | Epoch Time: 0m 10s
	Train Loss: 0.820 | Train Acc: 67.89%
	 Val. Loss: 0.842 |  Val. Acc: 68.27%
Epoch: 10 | Epoch T

In [95]:
# Создаем словарь для сравнения моделей

models_comparison = {}

In [96]:
# Загружаем лучшую версию LSTM модели и оцениваем ее на test датасете

lstm_model.load_state_dict(torch.load('best-model_lstm.pt'))

test_loss, test_acc = evaluate(lstm_model, test_dataloader, lstm_criterion)

models_comparison['lstm'] = {'Test Loss': test_loss, 'Test Acc': test_acc}

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.787 | Test Acc: 73.34%


In [97]:
# Загружаем лучшую версию LSTM-Bidirectional модели и оцениваем ее на test датасете

lstm_bi_model.load_state_dict(torch.load('best-model_lstm_bi.pt'))

test_loss, test_acc = evaluate(lstm_bi_model, test_dataloader, lstm_bi_criterion)

models_comparison['lstm_bi'] = {'Test Loss': test_loss, 'Test Acc': test_acc}

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.824 | Test Acc: 73.90%


In [98]:
# Загружаем лучшую версию GRU модели и оцениваем ее на test датасете

gru_model.load_state_dict(torch.load('best-model_gru.pt'))

test_loss, test_acc = evaluate(gru_model, test_dataloader, gru_criterion)

models_comparison['gru'] = {'Test Loss': test_loss, 'Test Acc': test_acc}

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.796 | Test Acc: 73.45%


In [99]:
# Загружаем лучшую версию GRU-Bidirectional модели и оцениваем ее на test датасете

gru_bi_model.load_state_dict(torch.load('best-model_gru_bi.pt'))

test_loss, test_acc = evaluate(gru_bi_model, test_dataloader, gru_bi_criterion)

models_comparison['gru_bi'] = {'Test Loss': test_loss, 'Test Acc': test_acc}

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.785 | Test Acc: 73.29%


In [100]:
pd.DataFrame.from_dict(models_comparison, orient='index')

Unnamed: 0,Test Loss,Test Acc
lstm,0.787253,0.73339
lstm_bi,0.823891,0.739006
gru,0.795801,0.734524
gru_bi,0.785351,0.732906


## 4. Получение результата для submit

In [109]:
# Создаем итераторы-загрузочники для полного датасета и проверяем работу итератора

BATCH_SIZE = 64

complete_dataloader = DataLoader(complete_data, batch_size=BATCH_SIZE, collate_fn=collate_batch, shuffle=True)

print('Complete')
for batch in complete_dataloader:
    print(f'Text matrix size: {batch[0].size()}')
    print(f'Target vector size: {batch[1].size()}')
    break

Complete
Text matrix size: torch.Size([64, 39])
Target vector size: torch.Size([64])


In [110]:
# Задаем гиперпараметры LSTM-Bidirectional модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(class_to_idx)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.4

BM1_model = LSTMClassifier(INPUT_DIM,
                           EMBEDDING_DIM,
                           HIDDEN_DIM,
                           OUTPUT_DIM,
                           N_LAYERS,
                           BIDIRECTIONAL,
                           DROPOUT)
BM1_model = BM1_model.to(device)

In [111]:
# Задаем оптимизатор и функцию потерь для LSTM-Bidirectional модели, переносим на device, обучаем модель
# В данном случае значения Val. Acc не имеют никакого смысла т.к. valid датасет входит в состав полного датасета, на котором обучается модель

learning_rate = 1e-3
BM1_optimizer = optim.Adam(BM1_model.parameters(), lr=learning_rate, weight_decay=1e-5)
BM1_scheduler = lr_scheduler.LinearLR(BM1_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [112]:
BM1_criterion = nn.CrossEntropyLoss()
BM1_criterion = lstm_bi_criterion.to(device)

train_model(complete_dataloader,
            BM1_model,
            'BM1',
            BM1_optimizer,
            BM1_scheduler,
            BM1_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 1.460 | Train Acc: 34.09%
	 Val. Loss: 1.339 |  Val. Acc: 41.72%
Epoch: 02 | Epoch Time: 0m 18s
	Train Loss: 1.305 | Train Acc: 43.52%
	 Val. Loss: 1.154 |  Val. Acc: 51.78%
Epoch: 03 | Epoch Time: 0m 18s
	Train Loss: 1.184 | Train Acc: 50.04%
	 Val. Loss: 1.039 |  Val. Acc: 58.41%
Epoch: 04 | Epoch Time: 0m 18s
	Train Loss: 1.081 | Train Acc: 55.45%
	 Val. Loss: 0.974 |  Val. Acc: 61.00%
Epoch: 05 | Epoch Time: 0m 18s
	Train Loss: 0.983 | Train Acc: 60.39%
	 Val. Loss: 0.866 |  Val. Acc: 66.01%
Epoch: 06 | Epoch Time: 0m 18s
	Train Loss: 0.907 | Train Acc: 63.99%
	 Val. Loss: 0.747 |  Val. Acc: 71.88%
Epoch: 07 | Epoch Time: 0m 18s
	Train Loss: 0.837 | Train Acc: 67.49%
	 Val. Loss: 0.650 |  Val. Acc: 75.53%
Epoch: 08 | Epoch Time: 0m 18s
	Train Loss: 0.774 | Train Acc: 70.34%
	 Val. Loss: 0.602 |  Val. Acc: 77.92%
Epoch: 09 | Epoch Time: 0m 18s
	Train Loss: 0.716 | Train Acc: 72.68%
	 Val. Loss: 0.533 |  Val. Acc: 80.72%
Epoch: 10 | Epoch T

In [113]:
# Загружаем лучшую версию LSTM-Bidirectional модели

BM1_model.load_state_dict(torch.load('best-model_BM1.pt'))

<All keys matched successfully>

In [114]:
# Задаем гиперпараметры GRU модели, создаем модель и переносим ее на device

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(class_to_idx)
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.4

BM2_model = GRUClassifier(INPUT_DIM,
                           EMBEDDING_DIM,
                           HIDDEN_DIM,
                           OUTPUT_DIM,
                           N_LAYERS,
                           BIDIRECTIONAL,
                           DROPOUT)
BM2_model = BM2_model.to(device)

In [115]:
# Задаем оптимизатор и функцию потерь для GRU модели, переносим на device, обучаем модель
# В данном случае значения Val. Acc не имеют никакого смысла т.к. valid датасет входит в состав полного датасета, на котором обучается модель

learning_rate = 1e-3
BM2_optimizer = optim.Adam(BM2_model.parameters(), lr=learning_rate, weight_decay=1e-5)
BM2_scheduler = lr_scheduler.LinearLR(BM2_optimizer, start_factor=1.0, end_factor=0.5, total_iters=30)

In [116]:
BM2_criterion = nn.CrossEntropyLoss()
BM2_criterion = lstm_bi_criterion.to(device)

train_model(complete_dataloader,
            BM2_model,
            'BM2',
            BM2_optimizer,
            BM2_scheduler,
            BM2_criterion,
            N_EPOCHS
           )

Epoch: 01 | Epoch Time: 0m 9s
	Train Loss: 1.558 | Train Acc: 27.85%
	 Val. Loss: 1.502 |  Val. Acc: 30.61%
Epoch: 02 | Epoch Time: 0m 9s
	Train Loss: 1.431 | Train Acc: 35.74%
	 Val. Loss: 1.282 |  Val. Acc: 43.51%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 1.256 | Train Acc: 46.06%
	 Val. Loss: 1.110 |  Val. Acc: 53.73%
Epoch: 04 | Epoch Time: 0m 9s
	Train Loss: 1.128 | Train Acc: 52.91%
	 Val. Loss: 0.944 |  Val. Acc: 62.59%
Epoch: 05 | Epoch Time: 0m 9s
	Train Loss: 1.010 | Train Acc: 59.31%
	 Val. Loss: 0.844 |  Val. Acc: 67.34%
Epoch: 06 | Epoch Time: 0m 10s
	Train Loss: 0.914 | Train Acc: 63.41%
	 Val. Loss: 0.747 |  Val. Acc: 71.90%
Epoch: 07 | Epoch Time: 0m 9s
	Train Loss: 0.823 | Train Acc: 67.83%
	 Val. Loss: 0.649 |  Val. Acc: 76.73%
Epoch: 08 | Epoch Time: 0m 9s
	Train Loss: 0.749 | Train Acc: 71.27%
	 Val. Loss: 0.559 |  Val. Acc: 79.93%
Epoch: 09 | Epoch Time: 0m 10s
	Train Loss: 0.683 | Train Acc: 74.33%
	 Val. Loss: 0.502 |  Val. Acc: 82.69%
Epoch: 10 | Epoch Time: 0

In [117]:
# Загружаем лучшую версию LSTM-Bidirectional модели

BM2_model.load_state_dict(torch.load('best-model_BM2.pt'))

<All keys matched successfully>

In [118]:
# Загружаем тексты для test датасета

test_df = pd.read_csv(f'{work_dir}/test.csv')
test_df

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Meanwhile In A Supermarket in Israel -- People...
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Did you panic buy a lot of non-perishable item...
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Asst Prof of Economics @cconces was on @NBCPhi...
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Gov need to do somethings instead of biar je r...


In [119]:
# Проверяем на наличие nan

nan_count = test_df.isna().sum().sum()
nan_count

0

In [120]:
# Предобрабатываем текст

test_df['Processed_text'] = test_df['Text'].map(process_text)
test_df['Processed_text']

  return BeautifulSoup(input_text, "lxml").text


0       trending: new yorkers encounter empty supermar...
1       find hand sanitizer fred meyer, turned #amazon...
2                   find protect loved ones #coronavirus.
3       #panic buying hits #newyork city anxious shopp...
4       #toiletpaper #dunnypaper #coronavirus #coronav...
                              ...                        
3793    meanwhile supermarket israel -- people dance s...
3794    panic buy lot non-perishable items? echo needs...
3795    asst prof economics @cconces @nbcphiladelphia ...
3796    gov need somethings instead biar je rakyat ass...
3797    @forestandpaper members committed safety emplo...
Name: Processed_text, Length: 3798, dtype: object

In [121]:
# Пишем функцию для генерации предсказания

def make_prediction(input_text, models):
    predictions = []
    for model in models:
        model.eval()
        processed_text = text_pipeline(input_text)
        processed_text = torch.tensor(processed_text, dtype=torch.int64).to(device)
        prediction = model(processed_text.unsqueeze(0), torch.tensor([len(processed_text)]).to(device))
        predictions.append(prediction)
    mean_prediction = torch.mean(torch.stack(predictions), 0)
    return torch.argmax(torch.softmax(mean_prediction, dim=1)).item()

In [122]:
models = [BM1_model, BM2_model]

test_df['pred'] = test_df['Processed_text'].apply(lambda x: make_prediction(x, models=models))
test_df

Unnamed: 0,id,Text,Processed_text,pred
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...,trending: new yorkers encounter empty supermar...,1
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...,"find hand sanitizer fred meyer, turned #amazon...",3
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...,find protect loved ones #coronavirus.,4
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...,#panic buying hits #newyork city anxious shopp...,2
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...,#toiletpaper #dunnypaper #coronavirus #coronav...,1
...,...,...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Meanwhile In A Supermarket in Israel -- People...,meanwhile supermarket israel -- people dance s...,3
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Did you panic buy a lot of non-perishable item...,panic buy lot non-perishable items? echo needs...,1
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Asst Prof of Economics @cconces was on @NBCPhi...,asst prof economics @cconces @nbcphiladelphia ...,2
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Gov need to do somethings instead of biar je r...,gov need somethings instead biar je rakyat ass...,0


In [123]:
# Генерируем датафрейм для сохранения

idx_to_class = {0: 'Extremely Negative',
                1: 'Negative',
                2: 'Neutral',
                3: 'Positive',
                4: 'Extremely Positive'
                }

def convert_idx_to_class(input_label):
    return idx_to_class[input_label]

res_df = pd.DataFrame()
res_df['id'] = test_df['id']
res_df['Sentiment'] = test_df['pred'].map(convert_idx_to_class)
res_df

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Neutral
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Negative
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Positive
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Negative
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Neutral
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Extremely Negative


In [124]:
# Сохраняем в csv

res_df.to_csv('submission.csv', index=False)

In [125]:
# Проверяем формат сохраненного файла

pd.read_csv('submission.csv')

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Neutral
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Negative
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Positive
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Negative
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Neutral
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Extremely Negative
