# Processing the data





Install the Transformers and Datasets libraries to run this notebook.

In [2]:
! pip install datasets transformers[sentencepiece]

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 4.2 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 36.6 MB/s 
Collecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.1 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 51.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 45.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.0 MB/s 
[?25hCollec

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ***EXPLORING THE DATASET***

In [3]:
import pandas as pd
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from datasets import load_metric

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/NLP -Hugging Face/Tweet Sentiment Analysis/train.csv')

test_data = pd.read_csv('/content/drive/MyDrive/NLP -Hugging Face/Tweet Sentiment Analysis/test.csv')
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
train_data['text'][:5]

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object

In [None]:
test_data.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


TRAIN DATA

In [None]:
train_data['text_len'] = train_data['text'].str.split().str.len()
#adding it to the dataframe
train_data.describe()

Unnamed: 0,text_len
count,27480.0
mean,12.902875
std,6.925938
min,1.0
25%,7.0
50%,12.0
75%,18.0
max,33.0


Count is number of sentences
max is max count of words in a sentence

In [None]:
train_data['text']

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27481, dtype: object

In [None]:
train_data['sentiment'].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [None]:
labels = train_data['sentiment'].unique().tolist() # list of all 6 emotions
labels_dict = {k: v for v, k in enumerate(labels)} #make s dictionary out of them
labels_dict

{'negative': 1, 'neutral': 0, 'positive': 2}

In [None]:
train_data['labels'] = train_data['sentiment'].map(labels_dict)
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,text_len,labels
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7.0,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10.0,1
2,088c60f138,my boss is bullying me...,bullying me,negative,5.0,1
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5.0,1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14.0,1


## ***VALIDATION DATASET***

In [None]:
test_data['sentiment'].value_counts()

neutral     1430
positive    1103
negative    1001
Name: sentiment, dtype: int64

In [None]:
test_data['text_len'] = test_data['text'].str.split().str.len()
test_data['labels'] = test_data['sentiment'].map(labels_dict)
test_data.head()

Unnamed: 0,textID,text,sentiment,text_len,labels
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,6,0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,15,2
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,13,1
3,01082688c6,happy bday!,positive,2,2
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,5,2


In [None]:
train_data =train_data.dropna()
test_data = test_data.dropna()

# ***TOKENIZING***

In [None]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_data)
train_dataset, train_dataset[0]

(Dataset({
     features: ['textID', 'text', 'selected_text', 'sentiment', 'text_len', 'labels', '__index_level_0__'],
     num_rows: 27480
 }),
 {'__index_level_0__': 0,
  'labels': 0,
  'selected_text': 'I`d have responded, if I were going',
  'sentiment': 'neutral',
  'text': ' I`d have responded, if I were going',
  'textID': 'cb774db0d1',
  'text_len': 7.0})

In [None]:
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_data)
test_dataset, test_dataset[0]

(Dataset({
     features: ['textID', 'text', 'sentiment', 'text_len', 'labels', '__index_level_0__'],
     num_rows: 3534
 }),
 {'__index_level_0__': 0,
  'labels': 0,
  'sentiment': 'neutral',
  'text': 'Last session of the day  http://twitpic.com/67ezh',
  'textID': 'f87dea47db',
  'text_len': 6})

In [None]:
from transformers import AutoTokenizer
model_checkpoint = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
train_dataset[100]

{'__index_level_0__': 100,
 'labels': 2,
 'selected_text': 'Pretty',
 'sentiment': 'positive',
 'text': '4am. And Im on the beach. Pretty',
 'textID': '2207d982bc',
 'text_len': 7.0}

In [None]:
import torch
max_len = 100  
#ASK how do u know
pad_to_max = True
def tokenize_data(example):
    # Tokenize the review body
    text_ = example['text']
    print(text_)
    encodings = tokenizer.encode_plus(text_, padding=True, max_length=max_len,
                                            truncation=True,
                                           add_special_tokens=True,
                                            return_token_type_ids=False,
                                            return_attention_mask=True,
                                            return_overflowing_tokens=False,
                                            return_special_tokens_mask=False,
                                           )
    
    # Subtract 1 from labels to have them in range 0-4
    targets = torch.tensor(example['labels'],dtype=torch.long)
    

    encodings.update({'labels': targets})
    return encodings

In [None]:
encoded_train_dataset = train_dataset.map(tokenize_data)
encoded_test_dataset = test_dataset.map(tokenize_data)

#ask

  0%|          | 0/27480 [00:00<?, ?ex/s]

  0%|          | 0/3534 [00:00<?, ?ex/s]

In [None]:
encoded_train_dataset.column_names

Two more - 'input_ids', 'attention_mask'added

In [None]:
encoded_train_dataset[:5]

# ***LOADING THE MODEL***

https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
batch_size = 128
num_labels = 6


model_checkpoint = 'roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# ***TRAINER***

In [None]:
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
print(metrics_list)

In [None]:
acc_metric = load_metric('accuracy')
f1_metric = load_metric('f1')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')

In [None]:
metric_name = "accuracy" #why only accuracy

args = TrainingArguments(
    output_dir = "test-results-concat",
    seed = 125, 
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    eval_steps = 100,
    save_steps = 100,
    fp16 = False
)

In [None]:
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    print(acc, f1,recall, precision)
    return {"accuracy": acc['accuracy'], "f1": f1['f1'],"recall": recall['recall'],"precision": precision['precision']} 

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset= encoded_train_dataset, 
    eval_dataset=encoded_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
!nvidia-smi #ask

In [None]:
trainer.train()

In [None]:
import numpy as np
trainer.evaluate()

In [None]:
trainer.save_model('Roberta_classification_model')

In [None]:
!zip -r 'Roberta_classification_model.zip' 'Roberta_classification_model'


In [None]:
!mv 'Roberta_classification_model.zip' 
#old location, new location

# ***TEST***

In [None]:
test_data['text_len'] = test_data['text'].str.split().str.len()
test_data['labels'] = test_data['emotion'].map(labels_dict)
test_data.head()

Unnamed: 0,text,emotion,text_len,labels
0,im feeling rather rotten so im not very ambiti...,sadness,11,0
1,im updating my blog because i feel shitty,sadness,8,0
2,i never make her separate from me because i do...,sadness,22,0
3,i left with my bouquet of red and yellow tulip...,joy,21,5
4,i was feeling a little vain when i did this one,sadness,11,0


In [None]:
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_data)
test_dataset, test_dataset[0]

(Dataset({
     features: ['text', 'emotion', 'text_len', 'labels'],
     num_rows: 2000
 }),
 {'emotion': 'sadness',
  'labels': 0,
  'text': 'im feeling rather rotten so im not very ambitious right now',
  'text_len': 11})

In [None]:
encoded_test_dataset = test_dataset.map(tokenize_data)

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [None]:
encoded_test_dataset.column_names

['text', 'emotion', 'text_len', 'labels', 'input_ids', 'attention_mask']

In [None]:
test_predictions = trainer.predict(encoded_test_dataset )

test_preds = np.argmax(test_predictions.predictions, axis=-1)

acc_metric = load_metric('accuracy')
f1_metric = load_metric('f1')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')

acc_metric.compute(predictions=test_preds, references=test_predictions.label_ids)
f1_metric.compute(predictions=test_preds, references=test_predictions.label_ids, average='weighted')
precision_metric .compute(predictions=test_preds, references=test_predictions.label_ids,average='weighted')
recall_metric.compute(predictions=test_preds, references=test_predictions.label_ids,average='weighted')

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, text_len, emotion.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 128


{'accuracy': 0.9215} {'f1': 0.9219627564689034} {'recall': 0.9215} {'precision': 0.92296837488362}


{'f1': 0.9219627564689034}

OTHER METHOD

In [None]:
trainer = Trainer(
    model,
    args,

    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, text_len, emotion.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'accuracy': 0.9215} {'f1': 0.9219627564689034} {'recall': 0.9215} {'precision': 0.92296837488362}


{'eval_accuracy': 0.9215,
 'eval_f1': 0.9219627564689034,
 'eval_loss': 0.17647869884967804,
 'eval_precision': 0.92296837488362,
 'eval_recall': 0.9215,
 'eval_runtime': 4.6309,
 'eval_samples_per_second': 431.877,
 'eval_steps_per_second': 3.455}

The test dataset performance is poor as compared to validation dataset