In [27]:
!pip install simpletransformers &> /dev/null
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')
import string
exclude = string.punctuation
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
train_data = pd.read_csv("content/2018-EI-reg-En-anger-test-gold.txt",sep="\t")
test_data = pd.read_csv("content/EI-reg-En-anger-train.txt",sep="\t")

In [29]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1002 non-null   object 
 1   Tweet             1002 non-null   object 
 2   Affect Dimension  1002 non-null   object 
 3   Intensity Score   1002 non-null   float64
dtypes: float64(1), object(3)
memory usage: 31.4+ KB


In [30]:
train_data["Tweet"]

0       @PageShhh1 I know you mean well but I'm offend...
1       Let go of resentment, it will hold you back, d...
2       No, I'm not 'depressed because of the weather,...
3       #AmarnathTerrorAttack  Muslims are killing eve...
4       Prepare to suffer the sting of Ghost Rider's p...
                              ...                        
997     That morning when you get half-way to work and...
998     @SenecaReads @zimyix @MouthyBuddha I bet he ne...
999     @markaw214 @appleofeden_3 @jjasq @XxPLWxX Ring...
1000    Have to go to a occupational services place fo...
1001                             Of course Molina #bitter
Name: Tweet, Length: 1002, dtype: object

In [31]:
# data preprocessing class
class Preprocessing:

  # lowercasing
  def convert_lowercase(self,text):
      text = text.lower()
      return text
  
  # removing usernames
  def remove_username(self, text):
    re_user = re.sub('@[^\s]+','',text)
    return re_user


  # removing html tags
  def remove_html_tags(self,text):
      re_html = re.compile('<.*?>')
      return re_html.sub(r'', text)

  # removing URLS
  def remove_url(self,text):
      re_url = re.compile('https?://\S+|www\.\S+')
      return re_url.sub('', text)

  # removing punctuations
  def remove_punc(self,text):
      return text.translate(str.maketrans('', '', exclude))

  # removing special characters
  def remove_special(self,text):
      x=''
      for i in text:
          if i.isalnum():
              x=x+i
          else:
              x=x+' '
      return x

  # removing digits
  def remove_digits(self,text):
      filtered_string = ''.join((x for x in text if not x.isdigit()))
      return filtered_string


  def preprocess(self,text):
    full_text = []
    for sent in text:
      sent = self.remove_username(sent)
      sent = self.convert_lowercase(sent)
      sent = self.remove_html_tags(sent)
      sent = self.remove_url(sent)
      sent = self.remove_digits(sent)
      sent = self.remove_punc(sent)
      sent = self.remove_special(sent)
      full_text.append(sent)
    return full_text

In [32]:
# preprocessing train and test data

train_data["Tweet"] = Preprocessing().preprocess(train_data["Tweet"])
test_data["Tweet"] = Preprocessing().preprocess(test_data["Tweet"])

In [33]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_data = train_data[["Tweet","Intensity Score"]]
test_data = test_data[["Tweet","Intensity Score"]]

# Enabling regression
# Setting optional model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 10
model_args.regression = True

# Create a ClassificationModel
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args    
)
# Train the model
model.train_model(train_data)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test_data)

# get pearson-r correlation score
from scipy.stats.stats import pearsonr
score = pearsonr(test_data["Intensity Score"],model_outputs)
print("The pearson-r score is {}".format(score))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

  0%|          | 0/1002 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/126 [00:00<?, ?it/s]



Running Epoch 1 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/126 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/126 [00:00<?, ?it/s]



  0%|          | 0/1701 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/213 [00:00<?, ?it/s]

The pearson-r score is (0.7421479601739319, 1.5035577123227295e-297)


In [25]:
# removing forlder
%rm -rf outputs
%rm -rf <folder_name>