In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from transformers import BertTokenizer
import numpy as np

# Load the dataset
data = pd.read_csv('tweet_emotions.csv')  # Replace with the actual path to your dataset

# Define the sentiment mapping to 7 categories
sentiment_mapping = {
    'happiness': 'happy',
    'enthusiasm': 'happy',
    'surprise': 'surprise',
    'boredom': 'sadness',
    'hate': 'anger',
    'relief': 'neutral',
    'empty': 'neutral',
    'worry': 'fear',
    'love': 'happy',
    'sadness': 'sadness',
    'anger': 'anger',
    'fun': 'happy'
}

# Apply the sentiment replacements
data['sentiment'] = data['sentiment'].replace(sentiment_mapping)

# Preprocessing: Tokenization and Text Cleaning
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize, remove stopwords, and apply lowercasing
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

# Apply text preprocessing
data['processed_text'] = data['content'].apply(preprocess_text)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts using BERT tokenizer
def tokenize_text(text):
    return tokenizer.encode(text, add_special_tokens=True, truncation=True, padding='max_length', max_length=64)

data['tokenized'] = data['processed_text'].apply(tokenize_text)

# Prepare the input for RandomForest
# Since Random Forest doesn't accept sequences directly, we need to get the tokenized input into a format suitable for it.
# We will use the BERT embeddings to convert the text into vectors.
from transformers import BertModel
import torch

# Load BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract BERT embeddings
def extract_bert_embeddings(tokens):
    # Convert token IDs to tensor and pass through BERT
    input_ids = torch.tensor(tokens).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        outputs = model(input_ids)
    # Get the embeddings from the [CLS] token (index 0)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings.flatten()  # Flatten to make it 1D

# Extract embeddings for the tokenized data
data['embeddings'] = data['tokenized'].apply(extract_bert_embeddings)

# Prepare the features and labels
X = np.array(data['embeddings'].tolist())
y = data['sentiment'].map({
    'happy': 0,
    'surprise': 1,
    'sadness': 2,
    'anger': 3,
    'neutral': 4,
    'fear': 5
})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['happy', 'surprise', 'sadness', 'anger', 'neutral', 'fear']))

# Save the updated dataset with processed text and sentiment
data.to_csv('updated_dataset.csv', index=False)  # Save to a new CSV file


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [3]:
print(data['sentiment'].unique())

['neutral' 'sadness' 'fun' 'worry' 'love' 'anger']
