
**Disaster Tweets Sentiment Analysis Notebook**

This notebook is designed to analyze the sentiment of tweets related to disasters using a pre-trained model for sentiment classification. It includes the following steps:

**1. Import Libraries:**

In [33]:
# Import necessary libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
from torch import optim
from datasets import load_dataset
from sklearn.metrics import confusion_matrix

# Import custom functions from src directory
from src.preprocessing import preprocess_text
from src.utils import calculate_accuracy, tokenize_text
from src.prediction_utils import predict_sentiment

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

**2. Data Loading and Preprocessing**

In [35]:
# Load the dataset from a CSV file
df = pd.read_csv('data/train.csv')

In [36]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [37]:
df.shape

(7613, 5)

In [38]:
# Display the distribution of target values in the dataset
df['target'].value_counts(normalize='true')

0    0.57034
1    0.42966
Name: target, dtype: float64

In [39]:
# Select relevant columns for further processing
data = df[['text', 'target']]

In [40]:
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [41]:
# Apply text preprocessing to the 'text' column
data['text'] = data['text'].apply(preprocess_text)

In [42]:
data.head()

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,13000 people receive wildfires evacuation orde...,1
4,got sent photo ruby alaska smoke wildfires pou...,1


In [43]:
# Map the 'target' column to binary values (0 or 1)
data['target'] = data['target'].apply(lambda x: 1 if x==0 else 0)

In [44]:
data.head()

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,0
1,forest fire near la ronge sask canada,0
2,residents asked shelter place notified officer...,0
3,13000 people receive wildfires evacuation orde...,0
4,got sent photo ruby alaska smoke wildfires pou...,0


**3. Model Loading and Configuration**

In [45]:
# Load model configuration from config.json
with open('config.json', 'r') as f:
    config = json.load(f)

In [46]:
# Extract relevant model configuration parameters
model_name = config['model_config']['model_name']
num_labels = config['model_config']['num_labels']
epochs = config['training_config']['epochs']
learning_rate = config['training_config']['learning_rate']

In [47]:
# Determine the device to use (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
# Load pre-trained model and tokenizer from Hugging Face Transformers
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [49]:
# Configure the model for the specified number of labels
model.config.num_labels = num_labels

**4. Data Preparation for Training**

In [51]:
# Tokenize the text data and convert to PyTorch tensors
training_data = [tokenize_text(text, tokenizer, device) for text in data['text']]
training_labels = data['target']

**5. Model Training**

In [52]:
# Initialize the optimizer and loss function
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
criterion = torch.nn.CrossEntropyLoss()

In [54]:
# Training loop
for epoch in range(epochs):
  outputs = []
  model.train()

  for text, label in zip(training_data, training_labels):

    output = model(**text)
    loss = criterion(output.logits, torch.tensor([label]).to(device))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    outputs.append(torch.argmax(output.logits).item())

  # Calculate and print accuracy for the current epoch
  accuracy = calculate_accuracy(outputs, training_labels)
  print(f'epoch: {epoch} has an Accuracy of {accuracy}')

epoch: 0 has an Accuracy of 80.65152
epoch: 1 has an Accuracy of 86.31289
epoch: 2 has an Accuracy of 90.55563
epoch: 3 has an Accuracy of 93.3272
epoch: 4 has an Accuracy of 95.08735
epoch: 5 has an Accuracy of 95.5865
epoch: 6 has an Accuracy of 96.19073
epoch: 7 has an Accuracy of 96.6636
epoch: 8 has an Accuracy of 96.90004
epoch: 9 has an Accuracy of 96.83436
epoch: 10 has an Accuracy of 97.18902
epoch: 11 has an Accuracy of 97.08394
epoch: 12 has an Accuracy of 97.41232
epoch: 13 has an Accuracy of 97.66189
epoch: 14 has an Accuracy of 97.55681


**6. Sentiment Prediction on Entire Data**

In [55]:
# Make predictions on the entire dataset
predicted_sentiments = []
for text in data["text"]:
    encoded_text = tokenize_text(text, tokenizer, device)
    predicted_sentiment = predict_sentiment(model, encoded_text)
    predicted_sentiments.append(predicted_sentiment)

**7. Evaluation**

In [56]:
# Assuming `data['target']` contains true labels and `predicted_sentiments` contains predictions
result = confusion_matrix(data['target'], predicted_sentiments)
print(result)

[[3139  132]
 [  31 4311]]


**8. Saving the model**

In [57]:
model.save_pretrained('model')