In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unit-3-nlp-txt-classification/sample_submission.csv
/kaggle/input/unit-3-nlp-txt-classification/train.csv
/kaggle/input/unit-3-nlp-txt-classification/test.csv


In [2]:
# Устанавливаем библиотеку для оценки метрики результата

!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


**Общее описание решения и вывод:**

1. В данной части работы, с использованием Transformers, применялись 2 модели: bert-base-uncased и roberta-base. bert-base-uncased показала лучший результат

2. Поскольку в задании указано, что нужно сравнить RNN и Trasformers, для одной из моделей (bert-base-uncased) был рассмотрен вариант, в котором часть с загрузкой и предобработкой данных оставалась неизменной. 

3. Кроме того, пробовал менять предобработку для достижения максимального результата, в результате наилучшее значение метрики на kaggle было достигнуто при минимальной преобработке: перевод всех символов в нижний регистр и удаление знаков препинания. Это отличается от результатов аналогичного анализа для RNN, где было получено, что только лемматизация ухудшает результат, остальные шаги по преобработке улучшают его. Возможно это связано с тем, что модели transformers могут вытаскивать дополнительную информацию об эмоциональной окраске текстов из emoji, слэнговых сокращений и т.д. (то, что удалялось в ходе предобработки для RNN)

4. Для обеих моделей (bert-base-uncased и roberta-base) был выполнен подбор гиперпараметров

5. При подборе гиперпараметров модель обучалась на train-val датасетах на 3 эпохах, перебирались разные значения batch_size и weight_decay, в качестве критерия для сравнения моделей использовался accuracy на val датасете

6. Ниже представлена таблица с лучшими, по результатам обучения, значениями гиперпараметров. Несмотря на то, что для bert-base-uncased лучший результат при обучении был получен для weight_decay=0.001, при weight_decay=0.01 результат на submit оказался выше.

| Model | batch_size | weight_decay |
| --- | --- | --- |
| bert-base-uncased | 16 | 0.001 |
| roberta-base | 16 | 0.01 |

7. Пробовал задавать lr_scheduler отличный от установленного в Train по умолчанию, но он не дает сильного улучшения в результате и скорости обучения, зато не всегда стабильно работает.

8. Для генерации submit обе модели обучались на полном датасете с лучшим набором гиперпараметров, без разделения на train, valid, test.

9. Ниже представлена сравнительная таблица с лучшими результатами на Kaggle для каждой из моделей

| Model | Preprocessing | Best kaggle score |
| --- | --- | --- |
| RNN | from RNN | 0.73354 |
| bert-base-uncased | from RNN | 0.77909 |
| bert-base-uncased | New | 0.87835 |
| roberta-base | New | 0.85939 |

10. Общий вывод: применение Transformers дало лучший результат в сравнении с RNN для текстов с аналогичной предобработкой. Дополнительная настройка предобработки текстов под модели Transformers позволила значительно улучшить результат. Дополнительно, в качестве преимущества Transformers, можно отметить, что предобработка текста требует значительно меньшего количества применяемых к тексту действий, а значит, занимает меньше расчетного времени, в сравнении с предобработкой для RNN.

11. Что можно улучшить: попробовать другие модели Transformers, возможно cased и без перевода слов в нижний регистр при предобработке. В этом случае возможно модель сможет вытащить дополнительную эмоциональную окраску из текстов, с учетом заглавных букв (например, слов написаных капсом).

In [3]:
work_dir = '/kaggle/input/unit-3-nlp-txt-classification'

In [4]:
# Загружаем train датасет, удаляя колонку с номером

train_df = pd.read_csv(f'{work_dir}/train.csv')
train_df = train_df.drop(['Unnamed: 0'], axis=1)
train_df

Unnamed: 0,Text,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41154,Airline pilots offering to stock supermarket s...,Neutral
41155,Response to complaint not provided citing COVI...,Extremely Negative
41156,You know itÂs getting tough when @KameronWild...,Positive
41157,Is it wrong that the smell of hand sanitizer i...,Neutral


In [5]:
# Проверяем на наличие nan

nan_count = train_df.isna().sum().sum()
nan_count

5

In [6]:
# Удаляем записи с nan с снова проверяем на наличие. nan больше нет

fixed_train_df = train_df.dropna()
nan_count = fixed_train_df.isna().sum().sum()
nan_count

0

In [7]:
# Смотрим распределение по классам.
# Видим, что тексты по классам распределены неравномерно, это нужно будет учесть при разбиении на train, val и test датасеты

fixed_train_df['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7711
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [8]:
# Задаем словарь для нумерации классов и заменяем названия на номера

class_to_idx = {'Extremely Negative': 0,
                 'Negative': 1,
                 'Neutral': 2,
                 'Positive': 3,
                 'Extremely Positive': 4
                }

def change_labels(input_label):
    return class_to_idx[input_label]

fixed_train_df['Sentiment'] = fixed_train_df['Sentiment'].map(change_labels)

In [9]:
fixed_train_df['Sentiment'].value_counts()

Sentiment
3    11422
1     9917
2     7711
4     6624
0     5481
Name: count, dtype: int64

In [10]:
# Проверяем тексты на наличие emojis

import emoji

def extract_emojis(input_text):    
    return [match["emoji"] for word in input_text for match in emoji.emoji_list(word)]

In [11]:
# Видим, что emojis, которые могли бы повлиять на смысловую окраску в текстах нет. Найденные эмоджи можно удалить

text_emojis = fixed_train_df['Text'].map(extract_emojis)
text_emojis.explode().value_counts().nlargest(25)

Text
©    65
®     5
Name: count, dtype: int64

In [12]:
# Словарь для преобразования emoticons из https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py

EMOTICONS_EMO = {
    u":‑)":"Happy face or smiley",
    u":-))":"Very Happy face or smiley",
    u":-)))":"Very very Happy face or smiley",
    u":)":"Happy face or smiley",
    u":))":"Very Happy face or smiley",
    u":)))":"Very very Happy face or smiley",
    u":-]":"Happy face or smiley",
    u":]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-)":"Happy face smiley",
    u":o)":"Happy face smiley",
    u":-}":"Happy face smiley",
    u":}":"Happy face smiley",
    u":-)":"Happy face smiley",
    u":c)":"Happy face smiley",
    u":^)":"Happy face smiley",
    u"=]":"Happy face smiley",
    u"=)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B^D":"Laughing, big grin or laugh with glasses",
    u":-))":"Very happy",
    u":-(":"Frown, sad, angry or pouting",
    u":‑(":"Frown, sad, angry or pouting",
    u":(":"Frown, sad, angry or pouting",
    u":‑c":"Frown, sad, angry or pouting",
    u":c":"Frown, sad, angry or pouting",
    u":‑<":"Frown, sad, angry or pouting",
    u":<":"Frown, sad, angry or pouting",
    u":‑[":"Frown, sad, angry or pouting",
    u":[":"Frown, sad, angry or pouting",
    u":-||":"Frown, sad, angry or pouting",
    u">:[":"Frown, sad, angry or pouting",
    u":{":"Frown, sad, angry or pouting",
    u":@":"Frown, sad, angry or pouting",
    u">:(":"Frown, sad, angry or pouting",
    u":'‑(":"Crying",
    u":'(":"Crying",
    u":'‑)":"Tears of happiness",
    u":')":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-*":"Kiss",
    u":*":"Kiss",
    u":X":"Kiss",
    u";‑)":"Wink or smirk",
    u";)":"Wink or smirk",
    u"*-)":"Wink or smirk",
    u"*)":"Wink or smirk",
    u";‑]":"Wink or smirk",
    u";]":"Wink or smirk",
    u";^)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑|":"Straight face",
    u":|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑)":"Angel, saint or innocent",
    u"O:)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑)":"Angel, saint or innocent",
    u"0:)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;^)":"Angel, saint or innocent",
    u">:‑)":"Evil or devilish",
    u">:)":"Evil or devilish",
    u"}:‑)":"Evil or devilish",
    u"}:)":"Evil or devilish",
    u"3:‑)":"Evil or devilish",
    u"3:)":"Evil or devilish",
    u">;)":"Evil or devilish",
    u"|;‑)":"Cool",
    u"|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑)":"Party all night",
    u"%‑)":"Drunk or confused",
    u"%)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑|":"Dump",
    u"(>_<)":"Troubled",
    u"(>_<)>":"Troubled",
    u"(';')":"Baby",
    u"(^^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(^_^;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(-_-;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(~_~;) (・.・;)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"(-_-)zzz":"Sleeping",
    u"(^_-)":"Wink",
    u"((+_+))":"Confused",
    u"(+o+)":"Confused",
    u"(o|o)":"Ultraman",
    u"^_^":"Joyful",
    u"(^_^)/":"Joyful",
    u"(^O^)／":"Joyful",
    u"(^o^)／":"Joyful",
    u"(__)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_(._.)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<(_ _)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m(__)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m(__)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m(_ _)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"('_')":"Sad or Crying",
    u"(/_;)":"Sad or Crying",
    u"(T_T) (;_;)":"Sad or Crying",
    u"(;_;":"Sad of Crying",
    u"(;_:)":"Sad or Crying",
    u"(;O;)":"Sad or Crying",
    u"(:_;)":"Sad or Crying",
    u"(ToT)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q.Q":"Sad or Crying",
    u"T.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"(-.-)":"Shame",
    u"(-_-)":"Shame",
    u"(一一)":"Shame",
    u"(；一_一)":"Shame",
    u"(=_=)":"Tired",
    u"(=^·^=)":"cat",
    u"(=^··^=)":"cat",
    u"=_^= ":"cat",
    u"(..)":"Looking down",
    u"(._.)":"Looking down",
    u"^m^":"Giggling with hand covering mouth",
    u"(・・?":"Confusion",
    u"(?_?)":"Confusion",
    u">^_^<":"Normal Laugh",
    u"<^!^>":"Normal Laugh",
    u"^/^":"Normal Laugh",
    u"（*^_^*）" :"Normal Laugh",
    u"(^<^) (^.^)":"Normal Laugh",
    u"(^^)":"Normal Laugh",
    u"(^.^)":"Normal Laugh",
    u"(^_^.)":"Normal Laugh",
    u"(^_^)":"Normal Laugh",
    u"(^^)":"Normal Laugh",
    u"(^J^)":"Normal Laugh",
    u"(*^.^*)":"Normal Laugh",
    u"(^—^）":"Normal Laugh",
    u"(#^.^#)":"Normal Laugh",
    u"（^—^）":"Waving",
    u"(;_;)/~~~":"Waving",
    u"(^.^)/~~~":"Waving",
    u"(-_-)/~~~ ($··)/~~~":"Waving",
    u"(T_T)/~~~":"Waving",
    u"(ToT)/~~~":"Waving",
    u"(*^0^*)":"Excited",
    u"(*_*)":"Amazed",
    u"(*_*;":"Amazed",
    u"(+_+) (@_@)":"Amazed",
    u"(*^^)v":"Laughing,Cheerful",
    u"(^_^)v":"Laughing,Cheerful",
    u"((d[-_-]b))":"Headphones,Listening to music",
    u'(-"-)':"Worried",
    u"(ーー;)":"Worried",
    u"(^0_0^)":"Eyeglasses",
    u"(＾ｖ＾)":"Happy",
    u"(＾ｕ＾)":"Happy",
    u"(^)o(^)":"Happy",
    u"(^O^)":"Happy",
    u"(^o^)":"Happy",
    u")^o^(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o.O":"Surpised",
    u"(o.o)":"Surprised",
    u"oO":"Surprised",
    u"(*￣m￣)":"Dissatisfied",
    u"(‘A`)":"Snubbed or Deflated"
}

In [13]:
# Проверяем текст на наличие emoticons

import re

def extract_emoticons(input_text):
    emoticon_pattern = re.compile(re.escape(u'(' + u'|'.join(k for k in EMOTICONS_EMO) + u')'))
    return re.findall(emoticon_pattern, input_text)

In [14]:
# Видим, что emoticons, которые представлены в словаре, в тексте нет

text_emoticons = fixed_train_df['Text'].map(extract_emoticons)
text_emoticons.explode().value_counts().nlargest(25)

Series([], Name: count, dtype: int64)

In [15]:
# Импортируем словарь для преобразования слэнговых сокращений

import requests

chat_words_map_dict = {}
chat_words_list = []

response = requests.get("https://raw.githubusercontent.com/rishabhverma17/sms_slang_translator/master/slang.txt")

for line in response.text.split("\n"):
    if line != "" and "=" in line:
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

In [16]:
# Загружаем библиотеки для предобработки текста

import nltk
import subprocess

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet



[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
Archive:  /kaggle/working/corpora/wordnet.zip
   creating: /kaggle/working/corpora/wordnet/
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/c

In [17]:
# Пишем функции для предобработки текста.
# За основу брал функции из статьи https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
# Как было показано выше, смысловых emojis в тексте нет, поэтому удаляем их
# Поскольку emoticon вносят вклад в эмоциональную окраску текста, а наша цель - классифицировать тексты
#    именно по эмоциональной окраске, я не стал их удалять, а заменил на слова
#    сделать корректировку правописания не получилось т.к. spellchecker не поднялся нормально на kaggle, постоянно выдает ошибки при импорте

# Итого препроцессинг состоит из следующих последовательных этапов
#    1. Переводим слова с нижний регистр  
#    2. Удаляем знаки препинания
#    3. Удаляем стоп-слова
#    4. Лемматизация текста. При 
#    5. Удаляем emojis
#    6. Удаляем куски url
#    7. Удаляем html тэги
#    8. Преобразовываем слэнговые сокращения

from tqdm import tqdm

from bs4 import BeautifulSoup

import string

import re

stopwords = nltk.corpus.stopwords.words('english')

lemmatizer = WordNetLemmatizer()

# Попытки подобрать параметры лемматизации, улучшающие результат
# Включал и выключал отдельные части речи, однако лучший результат все равно получен без лемматизации
# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# wordnet_map = {"N":wordnet.NOUN}

def remove_punctuation(input_text):
    return " ".join([word for word in str(input_text).split() if word not in set(string.punctuation)])

def remove_stopwords(input_text):
    return " ".join([word for word in str(input_text).split() if word not in stopwords])

def lemmatize_words(input_text):
    pos_tagged_text = nltk.pos_tag(input_text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

def remove_emojis(input_text):
    return emoji.replace_emoji(input_text, replace='')

def remove_urls(input_text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', input_text)

def remove_html(input_text):
    return BeautifulSoup(input_text, "lxml").text

def chat_words_conversion(input_text):
    new_text = []
    for w in input_text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

def process_text(input_text):
    processed_text = input_text.lower() # Переводим слова с нижний регистр  
    processed_text = remove_punctuation(processed_text) # Удаляем знаки препинания
    #processed_text = remove_stopwords(processed_text) # Удаляем 
    # processed_text = lemmatize_words(processed_text) # Лемматизация текста. Отключена т.к. ухудшает результат
    #processed_text = remove_emojis(processed_text) # Удаляем emojis
    #processed_text = remove_urls(processed_text) # Удаляем куски url
    #processed_text = remove_html(processed_text) # Удаляем html тэги
    #processed_text = chat_words_conversion(processed_text) # Преобразовываем слэнговые сокращения
    return processed_text

tqdm.pandas()

fixed_train_df['Processed_text'] = fixed_train_df['Text'].progress_map(process_text)
fixed_train_df

100%|██████████| 41155/41155 [00:01<00:00, 32389.82it/s]


Unnamed: 0,Text,Sentiment,Processed_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2,@menyrbie @phil_gahan @chrisitv https://t.co/i...
1,advice Talk to your neighbours family to excha...,3,advice talk to your neighbours family to excha...
2,Coronavirus Australia: Woolworths to give elde...,3,coronavirus australia: woolworths to give elde...
3,My food stock is not the only one which is emp...,3,my food stock is not the only one which is emp...
4,"Me, ready to go at supermarket during the #COV...",0,"me, ready to go at supermarket during the #cov..."
...,...,...,...
41154,Airline pilots offering to stock supermarket s...,2,airline pilots offering to stock supermarket s...
41155,Response to complaint not provided citing COVI...,0,response to complaint not provided citing covi...
41156,You know itÂs getting tough when @KameronWild...,3,you know itâs getting tough when @kameronwild...
41157,Is it wrong that the smell of hand sanitizer i...,2,is it wrong that the smell of hand sanitizer i...


In [18]:
# Пишем функции для предобработки текста

import string

import nltk
import subprocess

nltk.download('wordnet')
nltk.download('punkt')

from nltk.tokenize import word_tokenize


def get_tokenized_text(input_text):
    return word_tokenize(input_text)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Смотрим максимальную длину токенизированного текста чтобы оценить, какое макс количество слов нужно будет задавать далее

def get_tokenized_text_length(input_text):
    return len(word_tokenize(input_text))

processed_text_len = fixed_train_df['Processed_text'].map(get_tokenized_text_length)
print(f"Max length of tokenized text: {processed_text_len.max()}")

Max length of tokenized text: 229


In [20]:
unprocessed_text_len = fixed_train_df['Text'].map(get_tokenized_text_length)
print(f"Max length of tokenized text: {processed_text_len.max()}")

Max length of tokenized text: 229


In [21]:
# Разделяем на train, valid и test выборки
# train используем для обучения модели, valid для валидации
# test модель не видит в процессе обучения, ее используем для сравнения моделей
# преобразуем в список полный датасет чтобы обучить на нем наилучшую модель для получения submit

from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

TEST_SPLIT_SIZE = 0.1
VALID_SPLIT_SIZE = 0.2

X_rem, X_test, y_rem, y_test = train_test_split(fixed_train_df['Processed_text'],
                                                fixed_train_df['Sentiment'].tolist(),
                                                test_size=TEST_SPLIT_SIZE,
                                                shuffle= True,
                                                stratify=fixed_train_df['Sentiment'].tolist(),
                                                random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_rem,
                                                  y_rem,
                                                  test_size=VALID_SPLIT_SIZE/(1.0-TEST_SPLIT_SIZE),
                                                  shuffle= True,
                                                  stratify=y_rem,
                                                  random_state=RANDOM_SEED)

train_data = pd.DataFrame(list(zip(X_train, y_train)), columns=['text', 'labels'])
test_data = pd.DataFrame(list(zip(X_test, y_test)), columns=['text', 'labels'])
valid_data = pd.DataFrame(list(zip(X_val, y_val)), columns=['text', 'labels'])

complete_data = pd.DataFrame(list(zip(fixed_train_df['Processed_text'].tolist(), fixed_train_df['Sentiment'].tolist())), columns=['text', 'labels'])

## 2. Создание датасетов

In [22]:
import datasets
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_data.reset_index(drop = True)),
    'val': Dataset.from_pandas(valid_data.reset_index(drop = True)),    
    'test': Dataset.from_pandas(test_data.reset_index(drop = True)),
    'complete': Dataset.from_pandas(complete_data.reset_index(drop = True)),
    })
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 28808
    })
    val: Dataset({
        features: ['text', 'labels'],
        num_rows: 8231
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 4116
    })
    complete: Dataset({
        features: ['text', 'labels'],
        num_rows: 41155
    })
})

In [23]:
import torch
from torch.utils.data import DataLoader

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
from transformers import AutoTokenizer
model_id = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_id, do_lower_case = True, batched=True)

def encode_data(input_data):
    encoding = tokenizer(input_data['text'])
    return encoding

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [25]:
encoded_dataset = dataset.map(encode_data, batched = True)
encoded_dataset = encoded_dataset.remove_columns(['text'])
encoded_dataset.set_format("torch")
encoded_dataset

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/42 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 28808
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8231
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4116
    })
    complete: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 41155
    })
})

## 3. Обучение модели

In [26]:
# Задаем гиперпараметры

NUM_EPOCHS = 3
WEIGHT_DECAY_VALUES = (0.01, 0.001, 1e-4)
BATCH_SIZE_VALUES = (16, 32)

In [27]:
# Пишем функцию для оценки accuracy результата

import evaluate
from transformers import EvalPrediction

metric = evaluate.load("accuracy")

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    metric_values = metric.compute(predictions=preds, references=p.label_ids)
    return {"accuracy": metric_values["accuracy"]}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [28]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

import wandb
wandb.login(key='xxx')

NUM_LABELS = len(class_to_idx)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [29]:
hyper_result = {'WEIGHT_DECAY': [],
                'BATCH_SIZE': [],
                'Test_acc': []
               }

for WEIGHT_DECAY in WEIGHT_DECAY_VALUES:
    for BATCH_SIZE in BATCH_SIZE_VALUES:
        model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels = NUM_LABELS)
        
        training_args = TrainingArguments(
            output_dir='/kaggle/working',
            overwrite_output_dir=True,
            learning_rate=5e-5,
            warmup_steps=500,
            weight_decay=WEIGHT_DECAY, # аналог L2-регуляризации для Adam
            do_train=True,
            do_eval=True,
            #load_best_model_at_end=True,
            group_by_length=True,            
            evaluation_strategy='epoch',
            save_strategy='no',
            report_to="none",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,            
            num_train_epochs=NUM_EPOCHS
        )                   
        trainer = Trainer(            
            model=model,
            args=training_args,
            data_collator=DataCollatorWithPadding(tokenizer, padding=True),
            train_dataset=encoded_dataset['train'],
            eval_dataset=encoded_dataset['val'],
            compute_metrics=compute_metrics,
            tokenizer=tokenizer
        )
        train_result = trainer.train()
        test_result = trainer.predict(encoded_dataset['test'])
        
        print(f"Weight decay: {WEIGHT_DECAY}")
        print(f"Batch size: {BATCH_SIZE}")
        print(f"Train-Val metrics: {train_result.metrics}")
        print(f"Test metrics: {test_result.metrics}")
        
        hyper_result['WEIGHT_DECAY'].append(WEIGHT_DECAY)
        hyper_result['BATCH_SIZE'].append(BATCH_SIZE)
        hyper_result['Test_acc'].append(test_result.metrics['test_accuracy'])

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2333,0.569603,0.790062
2,0.4179,0.419273,0.852752
3,0.2262,0.376301,0.87948




Weight decay: 0.01
Batch size: 16
Train-Val metrics: {'train_runtime': 884.3774, 'train_samples_per_second': 97.723, 'train_steps_per_second': 3.056, 'total_flos': 2519134208160096.0, 'train_loss': 0.5436919420858335, 'epoch': 3.0}
Test metrics: {'test_loss': 0.392692893743515, 'test_accuracy': 0.8690476190476191, 'test_runtime': 16.3667, 'test_samples_per_second': 251.486, 'test_steps_per_second': 7.882}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.710774,0.724699
2,1.153600,0.438032,0.842789
3,0.466000,0.381241,0.869518




Weight decay: 0.01
Batch size: 32
Train-Val metrics: {'train_runtime': 683.0028, 'train_samples_per_second': 126.535, 'train_steps_per_second': 1.981, 'total_flos': 2542255714929120.0, 'train_loss': 0.6685206772042308, 'epoch': 3.0}
Test metrics: {'test_loss': 0.38911452889442444, 'test_accuracy': 0.8658892128279884, 'test_runtime': 15.5044, 'test_samples_per_second': 265.472, 'test_steps_per_second': 4.192}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2766,0.538144,0.801847
2,0.4134,0.411663,0.859798
3,0.2279,0.373467,0.881302




Weight decay: 0.001
Batch size: 16
Train-Val metrics: {'train_runtime': 875.7348, 'train_samples_per_second': 98.687, 'train_steps_per_second': 3.087, 'total_flos': 2519134208160096.0, 'train_loss': 0.5498915533643716, 'epoch': 3.0}
Test metrics: {'test_loss': 0.38582366704940796, 'test_accuracy': 0.8763362487852284, 'test_runtime': 16.4236, 'test_samples_per_second': 250.615, 'test_steps_per_second': 7.855}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.69576,0.733447
2,1.156700,0.469865,0.828332
3,0.467900,0.382605,0.867938




Weight decay: 0.001
Batch size: 32
Train-Val metrics: {'train_runtime': 682.2936, 'train_samples_per_second': 126.667, 'train_steps_per_second': 1.983, 'total_flos': 2542255714929120.0, 'train_loss': 0.6699940920404743, 'epoch': 3.0}
Test metrics: {'test_loss': 0.3928266763687134, 'test_accuracy': 0.8649173955296404, 'test_runtime': 15.4828, 'test_samples_per_second': 265.844, 'test_steps_per_second': 4.198}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2768,0.54054,0.801361
2,0.4165,0.433448,0.853359
3,0.2275,0.387429,0.877293




Weight decay: 0.0001
Batch size: 16
Train-Val metrics: {'train_runtime': 880.6745, 'train_samples_per_second': 98.134, 'train_steps_per_second': 3.069, 'total_flos': 2519134208160096.0, 'train_loss': 0.5495597424438341, 'epoch': 3.0}
Test metrics: {'test_loss': 0.4039320945739746, 'test_accuracy': 0.8646744412050534, 'test_runtime': 16.4515, 'test_samples_per_second': 250.19, 'test_steps_per_second': 7.841}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.691198,0.736484
2,1.156100,0.467101,0.830762
3,0.463300,0.380035,0.870247




Weight decay: 0.0001
Batch size: 32
Train-Val metrics: {'train_runtime': 682.6291, 'train_samples_per_second': 126.605, 'train_steps_per_second': 1.982, 'total_flos': 2542255714929120.0, 'train_loss': 0.6682669674125497, 'epoch': 3.0}
Test metrics: {'test_loss': 0.39337849617004395, 'test_accuracy': 0.8668610301263362, 'test_runtime': 15.5541, 'test_samples_per_second': 264.625, 'test_steps_per_second': 4.179}


In [30]:
pd.DataFrame.from_dict(hyper_result)

Unnamed: 0,WEIGHT_DECAY,BATCH_SIZE,Test_acc
0,0.01,16,0.869048
1,0.01,32,0.865889
2,0.001,16,0.876336
3,0.001,32,0.864917
4,0.0001,16,0.864674
5,0.0001,32,0.866861


## 4. Обучение на все данных и получение результата для submit

In [31]:
# Задаем гиперпараметры

NUM_EPOCHS = 5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 16

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels = NUM_LABELS)
        
training_args = TrainingArguments(
    output_dir='/kaggle/working',
    overwrite_output_dir=True,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY, # аналог L2-регуляризации для Adam
    load_best_model_at_end=True,
    group_by_length=True,            
    evaluation_strategy='epoch',
    save_strategy='epoch',
    report_to="none",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,            
    num_train_epochs=NUM_EPOCHS
)                   
trainer = Trainer(            
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer, padding=True),
    train_dataset=encoded_dataset['complete'],
    eval_dataset=encoded_dataset['val'],
    tokenizer=tokenizer
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6874,0.43194
2,0.344,0.199518
3,0.2376,0.20346
4,0.1403,0.076914
5,0.083,0.044062




TrainOutput(global_step=6435, training_loss=0.33732818064063486, metrics={'train_runtime': 2030.1966, 'train_samples_per_second': 101.357, 'train_steps_per_second': 3.17, 'total_flos': 6016047295678980.0, 'train_loss': 0.33732818064063486, 'epoch': 5.0})

In [32]:
# Загружаем тексты для test датасета

test_df = pd.read_csv(f'{work_dir}/test.csv')
test_df

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Meanwhile In A Supermarket in Israel -- People...
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Did you panic buy a lot of non-perishable item...
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Asst Prof of Economics @cconces was on @NBCPhi...
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Gov need to do somethings instead of biar je r...


In [33]:
# Проверяем на наличие nan

nan_count = test_df.isna().sum().sum()
nan_count

0

In [34]:
# Предобрабатываем текст

test_df['text'] = test_df['Text'].map(process_text)
test_df['text']

0       trending: new yorkers encounter empty supermar...
1       when i couldn't find hand sanitizer at fred me...
2       find out how you can protect yourself and love...
3       #panic buying hits #newyork city as anxious sh...
4       #toiletpaper #dunnypaper #coronavirus #coronav...
                              ...                        
3793    meanwhile in a supermarket in israel -- people...
3794    did you panic buy a lot of non-perishable item...
3795    asst prof of economics @cconces was on @nbcphi...
3796    gov need to do somethings instead of biar je r...
3797    i and @forestandpaper members are committed to...
Name: text, Length: 3798, dtype: object

In [35]:
submit_df = test_df.drop(['id', 'Text'], axis=1)
submit_df

Unnamed: 0,text
0,trending: new yorkers encounter empty supermar...
1,when i couldn't find hand sanitizer at fred me...
2,find out how you can protect yourself and love...
3,#panic buying hits #newyork city as anxious sh...
4,#toiletpaper #dunnypaper #coronavirus #coronav...
...,...
3793,meanwhile in a supermarket in israel -- people...
3794,did you panic buy a lot of non-perishable item...
3795,asst prof of economics @cconces was on @nbcphi...
3796,gov need to do somethings instead of biar je r...


In [36]:
# Генерируем датасет

submit_dataset = DatasetDict({
    'submit': Dataset.from_pandas(submit_df.reset_index(drop = True))})
submit_dataset

DatasetDict({
    submit: Dataset({
        features: ['text'],
        num_rows: 3798
    })
})

In [37]:
encoded_dataset = submit_dataset.map(encode_data, batched = True)
encoded_dataset = encoded_dataset.remove_columns(['text'])
encoded_dataset.set_format("torch")
encoded_dataset

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    submit: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3798
    })
})

In [38]:
predictions = trainer.predict(encoded_dataset['submit'])

res_preds = []

for pred in predictions.predictions:
    res_preds.append(np.argmax(pred))

test_df['pred'] = res_preds

In [39]:
# Генерируем датафрейм для сохранения

idx_to_class = {0: 'Extremely Negative',
                1: 'Negative',
                2: 'Neutral',
                3: 'Positive',
                4: 'Extremely Positive'
                }

def convert_idx_to_class(input_label):
    return idx_to_class[input_label]

res_df = pd.DataFrame()
res_df['id'] = test_df['id']
res_df['Sentiment'] = test_df['pred'].map(convert_idx_to_class)
res_df

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Extremely Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Negative
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Neutral
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Positive
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Negative
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Neutral
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Extremely Negative


In [40]:
# Сохраняем в csv

res_df.to_csv('submission.csv', index=False)

In [41]:
# Проверяем формат сохраненного файла

pd.read_csv('submission.csv')

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Extremely Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Negative
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Neutral
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Positive
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Negative
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Neutral
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Extremely Negative
