In [None]:
import numpy as np
import pandas as pd

## Reading the *dataset*

In [None]:
# Load the raw comments CSV into a DataFrame.
# NOTE(review): absolute path (looks like a Colab /content upload) — adjust when running elsewhere.
df = pd.read_csv("/content/YoutubeCommentsDataSet.csv")

In [None]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [None]:
# Column dtypes and non-null counts (reveals which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18408 entries, 0 to 18407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    18364 non-null  object
 1   Sentiment  18408 non-null  object
dtypes: object(2)
memory usage: 287.8+ KB


In [None]:
# (rows, columns) of the raw frame.
df.shape

(18408, 2)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Comment,44
Sentiment,0


In [None]:
# Drop every row that has a missing value. This rebinds `df`, so re-running the
# inspection cells above afterwards will show the cleaned frame, not the raw one.
df = df.dropna()

## Reducing the dataset with stratified sampling

In [None]:
from sklearn.model_selection import train_test_split

# Keep a 40% subsample of the data. Stratifying on Sentiment ensures the class
# distribution in the reduced dataset is the same as in the original dataset.
reduced_df, _ = train_test_split(
    df,
    train_size=0.4,
    stratify=df['Sentiment'],
    random_state=42,
)

# Sanity-check the subsample size and its class balance.
print(reduced_df.shape)
print(reduced_df['Sentiment'].value_counts())


(7345, 2)
Sentiment
positive    4560
neutral     1850
negative     935
Name: count, dtype: int64


In [None]:
from torch.nn import CrossEntropyLoss

# Calculate class weights
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch

# This cell needs `torch` and `device`, but the notebook only imports/defines them
# in later cells; bring them into scope here so a fresh Restart & Run All does not
# fail with a NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 'balanced' weights are inversely proportional to class frequency, so the rarer
# classes contribute more to the loss. np.unique returns the labels in sorted
# order, which is the same order LabelEncoder assigns the integer ids in, so
# weight i lines up with encoded class i.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(reduced_df['Sentiment']),
    y=reduced_df['Sentiment']
)

# The weight tensor must live on the same device as the logits passed to the loss.
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
loss_fn = CrossEntropyLoss(weight=class_weights)


## Installing all the necessary libraries

In [None]:
pip install transformers torch scikit-learn pandas matplotlib



## Preprocessing the dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer


# Encode labels.
# The encoder is fitted on the untouched string labels in `df`, and the column is
# only transformed while it is not yet integer-typed. Re-running this cell therefore
# neither re-encodes the integers nor replaces label_encoder.classes_ with them
# (which would make inverse_transform return numbers instead of sentiment names).
label_encoder = LabelEncoder()
label_encoder.fit(df['Sentiment'])

# Work on an explicit copy: reduced_df was derived from df by train_test_split, and
# assigning a column into it directly can trigger pandas' SettingWithCopyWarning.
reduced_df = reduced_df.copy()
if not pd.api.types.is_integer_dtype(reduced_df['Sentiment']):
    reduced_df['Sentiment'] = label_encoder.transform(reduced_df['Sentiment'])

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


## Create PyTorch dataset

In [None]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per access.

    Each item is built from the row's 'Comment' text and 'Sentiment' label.

    Args:
        df: Frame providing the 'Comment' and 'Sentiment' columns.
        tokenizer: Callable invoked with the comment plus padding/truncation
            keyword arguments; its result is iterated by key.
        max_length: Length passed to the tokenizer for padding and truncation.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row `idx` plus a 'labels' tensor."""
        # Rows are addressed by position, so the frame's index labels do not matter.
        row = self.df.iloc[idx]
        tokenized = self.tokenizer(
            row['Comment'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        sample = {}
        for name in tokenized:
            # Drop the leading dimension the tokenizer output carries for a single text.
            sample[name] = tokenized[name].squeeze(0)
        sample['labels'] = torch.tensor(row['Sentiment'], dtype=torch.long)
        return sample




## Split dataset into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Randomly take 80% of the rows for training; the remaining rows form the validation set.
train_data = reduced_df.sample(frac=0.8, random_state=42)
val_data = reduced_df.drop(index=train_data.index)

# Wrap both frames so each row is tokenized when it is accessed.
train_dataset = SentimentDataset(df=train_data, tokenizer=tokenizer)
val_dataset = SentimentDataset(df=val_data, tokenizer=tokenizer)

# Only the training loader shuffles; the validation loader keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)


## Load the DistilBERT Model

In [None]:
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW

# Load pre-trained DistilBERT model for sequence classification.
# The number of output labels is taken from the fitted label encoder instead of a
# hard-coded 3, so the head always matches the classes present in the data.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_encoder.classes_),
)
optimizer = AdamW(model.parameters(), lr=1e-5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the model

In [None]:
from torch.optim import Adam
import torch.nn.functional as F

# Use the GPU when one is available, otherwise fall back to the CPU,
# and move the model's parameters to that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def train_epoch(model, data_loader, optimizer):
    """Run one optimisation pass over `data_loader` and return the mean batch loss.

    Uses the notebook-level `device` for tensor placement and `loss_fn` as the
    training criterion.

    Args:
        model: Model called with `input_ids` and `attention_mask`; its output
            must expose `.logits`.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left over from the previous batch before the new forward pass.
        optimizer.zero_grad()

        batch_logits = model(ids, attention_mask=mask).logits
        batch_loss = loss_fn(batch_logits, targets)
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(data_loader)


# Run training: one full pass over train_loader per epoch, printing the mean
# per-batch training loss returned by train_epoch.
epochs = 10
for epoch in range(epochs):
    loss = train_epoch(model, train_loader, optimizer)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}")


Epoch 1/10, Loss: 0.6725
Epoch 2/10, Loss: 0.3948
Epoch 3/10, Loss: 0.2466
Epoch 4/10, Loss: 0.1610
Epoch 5/10, Loss: 0.1172
Epoch 6/10, Loss: 0.0892
Epoch 7/10, Loss: 0.0653
Epoch 8/10, Loss: 0.0620
Epoch 9/10, Loss: 0.0402
Epoch 10/10, Loss: 0.0369


In [None]:
from sklearn.metrics import accuracy_score

def evaluate(model, data_loader):
    """Return the accuracy of `model` over every batch in `data_loader`.

    Uses the notebook-level `device` for tensor placement. The predicted class
    of each example is the index of its largest logit.

    Args:
        model: Model called with `input_ids` and `attention_mask`; its output
            must expose `.logits`.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries.

    Returns:
        The value computed by `accuracy_score(true_labels, predictions)`.
    """
    model.eval()
    predicted = []
    expected = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, dim=1)

            # Move results back to the CPU before collecting them.
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return accuracy_score(expected, predicted)

# Evaluate the model on val_loader, i.e. the rows held out from training in the split above.
accuracy = evaluate(model, val_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 82.03%


## Predicting with the model

In [None]:
def predict(model, comment, max_length=256, return_confidence=False):
    """Classify a single comment and return its sentiment label.

    Relies on the notebook-level `tokenizer`, `device` and `label_encoder`.

    Args:
        model: Sequence-classification model whose output exposes `.logits`.
        comment: Text to classify.
        max_length: Length the input is padded/truncated to. Defaults to 256,
            the value that used to be hard-coded here.
        return_confidence: When True, also return the softmax probability of
            the predicted class (previously computed but discarded).

    Returns:
        The decoded sentiment label, or a `(label, probability)` tuple when
        `return_confidence` is True.
    """
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(
            comment,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        confidence = probs[0][prediction].item()

    # Decode the label (use inverse transform from label encoder)
    sentiment = label_encoder.inverse_transform([prediction])[0]
    if return_confidence:
        return sentiment, confidence
    return sentiment


# Example prediction on a single hand-written comment.
comment = "It could be an effective strategy, though there are pros and cons to consider."
sentiment = predict(model, comment)
print(f"Sentiment: {sentiment}")


Sentiment: neutral


In [None]:
# Save only the model weights (the state_dict). The tokenizer and label_encoder
# are not written by this cell.
torch.save(model.state_dict(), 'model.pth')
