In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score,classification_report

from tokenizers import BertWordPieceTokenizer

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [2]:
train = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',encoding='latin1')
test = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',encoding='latin1')


In [3]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
train.describe()

Unnamed: 0,UserName,ScreenName
count,41157.0,41157.0
mean,24377.0,69329.0
std,11881.146851,11881.146851
min,3799.0,48751.0
25%,14088.0,59040.0
50%,24377.0,69329.0
75%,34666.0,79618.0
max,44955.0,89907.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [6]:
train.groupby('Sentiment')['Sentiment'].count()

Sentiment
Extremely Negative     5481
Extremely Positive     6624
Negative               9917
Neutral                7713
Positive              11422
Name: Sentiment, dtype: int64

In [7]:
label = train['Sentiment'].unique()
label

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [8]:
label_dict = {'Neutral':0,'Positive':1, 'Extremely Positive':2, 'Negative': 3, 'Extremely Negative': 4}
label_dict

{'Neutral': 0,
 'Positive': 1,
 'Extremely Positive': 2,
 'Negative': 3,
 'Extremely Negative': 4}

In [9]:
train['Sentiment'] = train['Sentiment'].map(label_dict)
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",4


In [10]:
X = train['OriginalTweet'].values
y=train['Sentiment'].values
#y = pd.get_dummies(y,drop_first=True)

Stratified with Sentiment values (target) as there is imbalance in the dataset for the different classes, so we want a 80:20 distribution of every class in train, test datasets.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0,stratify=train['Sentiment'])

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [13]:
type(X_train.tolist())

list

In [14]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
    #y_train.idxmax(axis=1)
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
    #y_test.idxmax(axis=1)
))

In [16]:
test_dataset

<TensorSliceDataset shapes: ({input_ids: (245,), attention_mask: (245,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int64)>

In [17]:
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [18]:
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

trainer.train()

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

[34m[1mwandb[0m: Paste an API key from your profile and hit enter:  


[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.


In [20]:
y_pred = trainer.predict(test_dataset).label_ids
y_test1 = y_test

confusion = confusion_matrix(y_test1, y_pred)
print('Confusion Matrix\n')
print(confusion)

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test1, y_pred)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test1, y_pred, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test1, y_pred, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test1, y_pred, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test1, y_pred, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test1, y_pred, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test1, y_pred, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test1, y_pred, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test1, y_pred, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test1, y_pred, average='weighted')))


print('\nClassification Report\n')
print(classification_report(y_test1, y_pred, target_names=label))

Confusion Matrix

[[1543    0    0    0    0]
 [   0 2285    0    0    0]
 [   0    0 1325    0    0]
 [   0    0    0 1983    0]
 [   0    0    0    0 1096]]

Accuracy: 1.00

Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 1.00
Macro Recall: 1.00
Macro F1-score: 1.00

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00

Classification Report

                    precision    recall  f1-score   support

           Neutral       1.00      1.00      1.00      1543
          Positive       1.00      1.00      1.00      2285
Extremely Negative       1.00      1.00      1.00      1325
          Negative       1.00      1.00      1.00      1983
Extremely Positive       1.00      1.00      1.00      1096

          accuracy                           1.00      8232
         macro avg       1.00      1.00      1.00      8232
      weighted avg       1.00      1.00      1.00      8232

