# Setiment analysis
- Data preprocessing
- tokenizing
- model building
- Training and Evaluation

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import nltk
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
import torch 
from datasets import load_dataset

2024-06-23 13:53:59.748190: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 13:53:59.748300: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 13:53:59.903305: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3
Note: you may need to restart the kernel to use updated packages.


## Data preprocessing
- loading data using nltk
- Turning it into a dataframe
- encoding the labels
- removing special characters and stopwords
- spliting the data

In [4]:
nltk.download('twitter_samples')
nltk.download('punkt')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
positive=twitter_samples.strings("positive_tweets.json")
negative=twitter_samples.strings("negative_tweets.json")
neutral=twitter_samples.strings("tweets.20150430-223406.json")

pos_df=pd.DataFrame({'tweet': positive, 'label': 'positive'})
neg_df=pd.DataFrame({'tweet': negative, 'label': 'negative'})
nuet_df=pd.DataFrame({'tweet': neutral, 'label': 'neutral'})

data=pd.concat([pos_df,neg_df,nuet_df])

In [6]:
labels={'negative': 0, 'neutral': 1, 'positive': 2}
data['label']=data['label'].map(labels)

In [7]:
data['tweet'] = data['tweet'].str.replace(r'@\w+', '', regex=True)
data['tweet'] = data['tweet'].str.replace(r'#\w+', '', regex=True)
data['tweet'] = data['tweet'].str.replace(r'RT', '', regex=True)

In [8]:
data['tweet']=data['tweet'].apply(nfx.remove_stopwords)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"],data['label'], test_size=0.3, random_state=22)

## Tokenizing
- create a dataframe for train and test sets using train_test_split output.
- define BERT Tokenzer and create a tokenizing funtion.
- create a tokenized train and test set.
- add labels to tokenized datasets.
- format tokenized datasets to be in tensors.

In [10]:
train_data=pd.DataFrame({'tweet':X_train,'label':y_train})
test_data=pd.DataFrame({'tweet':X_test,'label':y_test})

In [11]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize(examples):
    return tokenizer(examples['tweet'], padding='max_length', truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
train_df=Dataset.from_pandas(train_data)
test_df=Dataset.from_pandas(test_data)

token_train=train_df.map(tokenize,batched=True)
token_test =test_df.map(tokenize,batched=True)

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [13]:
token_train=token_train.add_column('labels',train_df['label'])
token_test=token_test.add_column('labels',test_df['label'])

token_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
token_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

## Model building
- load BERT model for classification.
- define training arguments using 'TraininArguments()' function.
- define a function for computing accuracy,precision,recall,and F1 score.
- define trainer.

In [14]:
model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)

training=TrainingArguments(
    output_dir='./results',
    run_name='sentiment',
    evaluation_strategy="epoch",       
    learning_rate=2e-5,                
    per_device_train_batch_size=8,     
    per_device_eval_batch_size=8,      
    num_train_epochs=5,                
    weight_decay=0.01,                 
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def metrics(p):
    pred=np.argmax(p.predictions,axis=1)
    pres,rec,f1,_=precision_recall_fscore_support(p.label_ids,pred,average='weighted')
    acc=accuracy_score(p.label_ids,pred)
    return{
        'accuracy:':acc,
        'precision:':pres,
        'recall:':rec,
        'f1 score:':f1
    }

trainer=Trainer(
   model=model,
   args=training,
   train_dataset=token_train,
   eval_dataset=token_test,
   tokenizer=tokenizer,
   compute_metrics=metrics
)
   

## Training and evaluation
- the data trained for 5 epoch witha batch size of 8 per device.
- training time:1 hour and 44 minutes.
- GPU T4 x2 was used for training.

In [16]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy:,Precision:,Recall:,F1 score:
1,0.0121,0.011282,0.998222,0.998234,0.998222,0.998224
2,0.0063,0.00872,0.998889,0.998893,0.998889,0.99889
3,0.0039,0.010091,0.998889,0.998893,0.998889,0.99889
4,0.0027,0.008979,0.999111,0.999114,0.999111,0.999112
5,0.0,0.009234,0.999111,0.999114,0.999111,0.999112




TrainOutput(global_step=6565, training_loss=0.008580492531933225, metrics={'train_runtime': 6332.2979, 'train_samples_per_second': 16.582, 'train_steps_per_second': 1.037, 'total_flos': 2.762690886144e+16, 'train_loss': 0.008580492531933225, 'epoch': 5.0})

In [17]:
results = trainer.evaluate()
print(results)



{'eval_loss': 0.00923437811434269, 'eval_accuracy:': 0.9991111111111111, 'eval_precision:': 0.9991139976368262, 'eval_recall:': 0.9991111111111111, 'eval_f1 score:': 0.9991115377994043, 'eval_runtime': 156.9644, 'eval_samples_per_second': 57.338, 'eval_steps_per_second': 3.587, 'epoch': 5.0}
