In [157]:
# # Install necessary libraries
# !pip install transformers numpy pandas torch scikit-learn

# Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [158]:
# Pin the random number generators of NumPy and PyTorch to one shared seed so
# that repeated runs of the notebook draw the same random numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [159]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint, so the vocabulary matches the model weights
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [160]:
# Smoke-test the model on a single example comment.
tokens = tokenizer.encode('Love this ❤️', return_tensors='pt')

# This is inference only, so run the forward pass without autograd: no graph
# is recorded and the returned logits carry no grad_fn. This matches how the
# full comment set is scored later in the notebook.
with torch.no_grad():
    result = fine_tuned_model(tokens)

# Last expression of the cell: show the raw model output (five class logits)
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.1243, -2.5862, -0.8446,  1.2741,  3.4680]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [161]:
# from google.colab import files
# files.upload()

In [162]:
# Read the comment data from CSV (the path is relative to the working directory)
csv_path = 'TikTok Data (3).csv'
df = pd.read_csv(csv_path)

Cleaning the data: converting emojis to words and removing special characters

In [163]:
# !pip install emoji
import re
from emoji import demojize

# Matches every character that is not a letter or digit. `\W` already covers
# punctuation such as ':', ',' and '@'; '_' is listed explicitly because the
# regex engine counts it as a word character.
NON_ALNUM = re.compile(r'[\W_]')

# Normalise the comment text:
#   1. replace missing comments with '' and cast to str, so every later step
#      receives a string instead of NaN or another non-string value,
#   2. lowercase,
#   3. convert emojis to their textual names with demojize,
#   4. replace every non-alphanumeric character with a space.
df['Comment Text'] = (
    df['Comment Text']
    .fillna('')
    .astype(str)
    .str.lower()
    .apply(demojize)
    .str.replace(NON_ALNUM, ' ', regex=True)
)

In [164]:
# Display the DataFrame to inspect the cleaned 'Comment Text' column
df

Unnamed: 0,Comment Text
0,love this red heart
1,pls coca cola pls
2,most definitely nothing wrong with that salut...
3,pepsi the official drink of good vibes smi...
4,yes
...,...
197,pepsi measures a coke
198,
199,measure a box of coca cola
200,


Tokenizing and Preprocessing comments

In [165]:
# Convert the cleaned comments into model inputs: PyTorch tensors, padded to a
# common length and truncated to at most 128 tokens
comment_texts = df['Comment Text'].tolist()
encoded_data = tokenizer(
    comment_texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [166]:
# Score every comment with the sentiment classifier. This is inference only,
# so gradient tracking is switched off.
with torch.no_grad():
    outputs = fine_tuned_model(**encoded_data)
    # Raw class scores from the classification head (these are logits, not
    # pooled BERT embeddings).
    logits = outputs.logits
    # argmax gives a 0-based class index (0..4); adding 1 shifts it to the
    # 1..5 keys used by sentiment_mapping below.
    predicted_labels = torch.argmax(logits, dim=1) + 1


# Map numeric sentiment labels to categories (Negative, Neutral, Positive)
sentiment_mapping = {1: 'Negative', 2: 'Negative', 3: 'Neutral', 4: 'Positive', 5: 'Positive'}

# Build the label Series on df's own index. Column assignment aligns on index
# labels, so a Series with a fresh 0..n-1 index would leave NaN (or attach
# labels to the wrong rows) whenever df is not indexed 0..n-1, for example
# after rows have been filtered out.
df['Sentiment_Class'] = pd.Series(predicted_labels.numpy(), index=df.index).map(sentiment_mapping)


KNN Classification

In [167]:
# Features (X): the per-comment class logits produced by the sentiment model.
# These are classification scores, not BERT embeddings.
X = logits.numpy()

# Target (y): the sentiment category predicted from those same logits.
# NOTE(review): y is a deterministic function of X (argmax followed by a fixed
# mapping), so the report below measures how well KNN reproduces the model's
# own labels, not agreement with human-annotated sentiment.
y = df['Sentiment_Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select k with 5-fold cross-validation on the training split only, so the
# test split stays untouched until the final evaluation.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['n_neighbors']

print("Best value for k:", best_k)

# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so reuse that fitted model instead of constructing and
# fitting an identical classifier a second time.
knn_classifier = grid_search.best_estimator_

# Make predictions on the testing data
y_pred = knn_classifier.predict(X_test)

# Evaluate the performance of the KNN model
print(classification_report(y_test, y_pred))


Best value for k: 3
              precision    recall  f1-score   support

    Negative       1.00      0.89      0.94        18
     Neutral       0.67      1.00      0.80         4
    Positive       0.95      0.95      0.95        19

    accuracy                           0.93        41
   macro avg       0.87      0.95      0.90        41
weighted avg       0.94      0.93      0.93        41

