In [9]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load train dataset
train_data = pd.read_csv("train.csv")

train_data

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [3]:
# Initialize the sentiment analysis pipeline
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [4]:
# Define sentiment mapping for comparison
sentiment_map = {'POSITIVE': 'positive', 'NEGATIVE': 'negative', 'NEUTRAL': 'neutral'}

In [5]:
predictions = []
# Predict sentiments
for text in tqdm(train_data['text'], desc='Predicting sentiments', unit='text'):
    # Make sure `text` is a string
    if isinstance(text, str):
        result = classifier(text)
        sentiment = result[0]['label']
        predictions.append(sentiment_map.get(sentiment, 'neutral'))
    else:
        print(f"Skipping non-string input: {text}")
        predictions.append('neutral')

# Add predicted sentiments to the DataFrame
train_data['predicted_sentiment'] = predictions

Predicting sentiments:   1%|          | 321/27481 [00:07<09:02, 50.05text/s]

Skipping non-string input: nan


Predicting sentiments: 100%|██████████| 27481/27481 [13:45<00:00, 33.27text/s]  


In [6]:
# Convert sentiments to numeric labels for accuracy calculation
label_map = {'positive': 1, 'negative': -1, 'neutral': 1}
train_data['actual_label'] = train_data['sentiment'].map(label_map)
train_data['predicted_label'] = train_data['predicted_sentiment'].map(label_map)

# Calculate accuracy
accuracy = accuracy_score(train_data['actual_label'], train_data['predicted_label'])
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.65


In [7]:
train_data

Unnamed: 0,textID,text,selected_text,sentiment,predicted_sentiment,actual_label,predicted_label
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,negative,1,-1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,positive,-1,1
2,088c60f138,my boss is bullying me...,bullying me,negative,negative,-1,-1
3,9642c003ef,what interview! leave me alone,leave me alone,negative,negative,-1,-1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,negative,-1,-1
...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,negative,-1,-1
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,positive,-1,1
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,positive,1,1
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,positive,1,1
