In [4]:
!pip install transformers torch scikit-learn pandas




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np


In [6]:
# Load the dataset
file_path = '/content/Leads Table.csv'  # The path where your file is uploaded
data = pd.read_csv(file_path)

# Inspect the first few rows
print(data.head())

# We'll use the 'notes' column for reviews (or feedback) and create a target column 'sentiment'
data['sentiment'] = data['notes'].apply(lambda x: 'POSITIVE' if 'good' in str(x).lower() else ('NEGATIVE' if 'bad' in str(x).lower() else 'NEUTRAL'))

# Check the distribution of sentiments
print(data['sentiment'].value_counts())


                                     id first_name last_name  \
0  a58faac7-36af-4dde-9e23-394c445e9e2f     Sandra   Johnson   
1  945d3f97-7632-4c81-a2ab-f10ef1554633       John     Clark   
2  64dd2f3b-6f6d-499f-894d-4cb23fb71847   Michelle    Wilson   
3  dd128bc5-022b-4190-8b4a-d3007c3da0cf   Michelle     Moore   
4  92b0807a-09ad-4f37-bf74-76dfe561ee32    Charles      Hill   

                         email              phone          company   position  \
0   sandra.johnson@outlook.com  +1 (439) 490-4207    Willy Wonka's    Founder   
1       john.clark@outlook.com       840-523-7347   Planet Express        CMO   
2  michelle.wilson@hotmail.com       356.819.8838     Monsters Inc        CMO   
3    michelle.moore@yandex.com     (360) 601-4225       Gekko & Co    Analyst   
4      charles.hill@icloud.com                NaN  Massive Dynamic  Team Lead   

  priority     status   source    value                  created_at  \
0     high   proposal   social  21761.0  2024-08-16 15:00

In [7]:
# Split data into training and test sets
X = data['notes']
y = data['sentiment']

# Convert labels to numeric (positive: 0, negative: 1, neutral: 2)
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 400
Test set size: 100


In [8]:
# Load the tokenizer and model for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Prepare the dataset
def encode_data(texts, tokenizer, max_length=256):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = encode_data(X_train, tokenizer)
test_encodings = encode_data(X_test, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [9]:
# Load the tokenizer and model for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Prepare the dataset
def encode_data(texts, tokenizer, max_length=256):
    # Convert the input to a list of strings to handle mixed data types
    texts = [str(text) for text in texts.tolist()]  # Ensure all elements are strings
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = encode_data(X_train, tokenizer)
test_encodings = encode_data(X_test, tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from torch.utils.data import Dataset, DataLoader

# Create a custom dataset class
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

# DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(3):
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed, loss: {loss.item()}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 completed, loss: 0.0021052213851362467
Epoch 2 completed, loss: 0.0006325658760033548
Epoch 3 completed, loss: 0.00027843256248161197


In [11]:
# Evaluate the model
model.eval()

predictions = []
probs = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(**batch)
        logits = outputs.logits
        probs_batch = torch.nn.functional.softmax(logits, dim=-1)  # Get probabilities
        probs.append(probs_batch)
        pred = torch.argmax(probs_batch, dim=-1)
        predictions.extend(pred.numpy())

# Convert predictions and probabilities to numpy arrays
probs = torch.cat(probs, dim=0).numpy()
predictions = np.array(predictions)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Show classification report
print(classification_report(y_test, predictions, target_names=le.classes_))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 100.00%
              precision    recall  f1-score   support

     NEUTRAL       1.00      1.00      1.00       100

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [15]:
inputs = tokenizer("The product is extremely disappointing. It stopped working after only a few days, and the customer service was unhelpful.", return_tensors="pt", padding=True, truncation=True, max_length=256)
print(inputs)


{'input_ids': tensor([[  101,  1996,  4031,  2003,  5186, 15640,  1012,  2009,  3030,  2551,
          2044,  2069,  1037,  2261,  2420,  1010,  1998,  1996,  8013,  2326,
          2001,  4895, 16001, 14376,  5313,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}


In [21]:
from transformers import pipeline

# Use a sentiment analysis pipeline with a fine-tuned model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

review = "This is by far the best purchase I’ve made in a long time. Worth every penny!"

sentiment = sentiment_analyzer(review)
print(f"Sentiment: {sentiment}")


Device set to use cuda:0


Sentiment: [{'label': 'POSITIVE', 'score': 0.9997574687004089}]


In [22]:
review = "This is by far the best purchase I’ve made in a long time. Worth every penny!"

sentiment = sentiment_analyzer(review)
print(f"Sentiment: {sentiment}")

Sentiment: [{'label': 'POSITIVE', 'score': 0.9997574687004089}]


In [24]:
review = "The product is extremely disappointing. It stopped working after only a few days, and the customer service was unhelpful."
sentiment = sentiment_analyzer(review)
print(f"Sentiment: {sentiment}")

Sentiment: [{'label': 'NEGATIVE', 'score': 0.9998055100440979}]
