In [1]:
!pip install kaggle



In [2]:
from google.colab import files

# Upload the Kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

# Unzip the downloaded dataset
!unzip -q twitter-entity-sentiment-analysis.zip -d twitter-sentiment-analysis

Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 76.7MB/s]


In [5]:
import pandas as pd

df1 = pd.read_csv('./twitter-sentiment-analysis/twitter_training.csv')
df2 = pd.read_csv('./twitter-sentiment-analysis/twitter_validation.csv')

In [6]:
# Adding columns
df1.columns = ['tweet_id', 'entity', 'output', 'tweet']
df2.columns = ['tweet_id', 'entity', 'output', 'tweet']

df = pd.concat([df1, df2], ignore_index=True)
# Display the DataFrame with the new column names
df.head()

Unnamed: 0,tweet_id,entity,output,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
df.dropna()

Unnamed: 0,tweet_id,entity,output,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
75675,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
75676,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
75677,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
75678,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [8]:
#dataset info
df['output'].value_counts()

Negative      22808
Positive      21108
Neutral       18603
Irrelevant    13161
Name: output, dtype: int64

In [9]:
print(df.dtypes)

tweet_id     int64
entity      object
output      object
tweet       object
dtype: object


In [10]:
df['tweet'] = df['tweet'].astype(str)
print(df.dtypes)

tweet_id     int64
entity      object
output      object
tweet       object
dtype: object


## **Preprocessing of text**
**Lowercasing:**

Convert all text to lowercase to ensure uniformity in the text data.

**Removing Special Characters and Punctuation:**

Remove special characters, such as '@', '#', and punctuation marks, as they may not contribute significantly to sentiment analysis.

**Removing URLs:**

Remove URLs or hyperlinks as they do not contain meaningful sentiment information.

**Removing Stopwords:**

Remove common stopwords (e.g., 'the', 'and', 'is') that do not carry much sentiment information. You can use a predefined list of stopwords for this purpose.

**Tokenization:**

Tokenize the text into individual words or phrases. This involves breaking down the text into a list of words.

**Lemmatization or Stemming:**

Reduce words to their base or root form. Lemmatization and stemming help in standardizing words, so variations of the same word are treated as one.

**Handling Emoticons and Abbreviations:**

Replace emoticons or commonly used abbreviations with their full forms to capture their sentiment.

**Removing Numeric Values:**

Remove any numerical values, as they may not contribute much to sentiment analysis.

In [11]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:

# Define a function for tweet preprocessing
def preprocess_tweet(tweet):


    # Lowercasing
    tweet = tweet.lower()

    # Removing URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)

    # Removing special characters and punctuation
    tweet = re.sub(r'\W', ' ', tweet)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(tweet)
    tweet = ' '.join([word for word in word_tokens if word not in stop_words])

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tweet = ' '.join([lemmatizer.lemmatize(word) for word in tweet.split()])

    return tweet

# Apply the preprocessing function to the 'tweet' column
df['cleaned_tweet'] = df['tweet'].apply(preprocess_tweet)

# Display the DataFrame with the new 'cleaned_tweet' column
df[['tweet', 'cleaned_tweet']]

Unnamed: 0,tweet,cleaned_tweet
0,I am coming to the borders and I will kill you...,coming border kill
1,im getting on borderlands and i will kill you ...,im getting borderland kill
2,im coming on borderlands and i will murder you...,im coming borderland murder
3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder
4,im getting into borderlands and i can murder y...,im getting borderland murder
...,...,...
75675,⭐️ Toronto is the arts and culture capital of ...,toronto art culture capital canada wonder want...
75676,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,actually good move tot bring viewer one people...
75677,Today sucked so it’s time to drink wine n play...,today sucked time drink wine n play borderland...
75678,Bought a fraction of Microsoft today. Small wins.,bought fraction microsoft today small win


In [13]:
df.drop_duplicates('cleaned_tweet')
df.head()

Unnamed: 0,tweet_id,entity,output,tweet,cleaned_tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderland murder


In [14]:
#dataset info
df['output'].value_counts()

Negative      22808
Positive      21108
Neutral       18603
Irrelevant    13161
Name: output, dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder

# Assume df['output'] is the column you encoded
# You can replace it with the actual column name from your DataFrame

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder to your labels and transform them
df['encoded_output'] = label_encoder.fit_transform(df['output'])

# Print original and encoded labels
original_labels = df['output'].unique()
encoded_labels = df['encoded_output'].unique()

print("Original labels:", original_labels)
print("Encoded labels:", encoded_labels)

# Mapping from original labels to numerical labels
label_mapping = dict(zip(original_labels, encoded_labels))
print("Label Mapping:", label_mapping)

# # Display the DataFrame with the new encoded column
# print(df.head())

Original labels: ['Positive' 'Neutral' 'Negative' 'Irrelevant']
Encoded labels: [3 2 1 0]
Label Mapping: {'Positive': 3, 'Neutral': 2, 'Negative': 1, 'Irrelevant': 0}


In [16]:
#converting label and tweets to list
tweets = df['cleaned_tweet'].values.tolist()
labels = df['encoded_output'].values.tolist()

In [17]:
#Compare the sizes of the two lists
if len(tweets) == len(labels):
    print("The sizes of 'tweets' and 'labels' lists are equal.")
    print(f"Number of elements in each list: {len(tweets)}")
else:
    print("The sizes of 'tweets' and 'labels' lists are not equal.")
    print(f"Number of elements in 'tweets' list: {len(tweets)}")
    print(f"Number of elements in 'labels' list: {len(labels)}")

The sizes of 'tweets' and 'labels' lists are equal.
Number of elements in each list: 75680


In [18]:
from sklearn.model_selection import train_test_split

# Assuming 'X' is your feature matrix and 'y' is your target variable
X_train, X_temp, y_train, y_temp = train_test_split(tweets, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [19]:
#install transformer models
!pip install transformers==4.36.2

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [20]:
from transformers import DistilBertTokenizerFast,TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased",do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [21]:
# train_encodings = tokenizer(tweets, truncation=True, padding = True,return_tensors="tf")
# print(train_encodings[0])
#train_encodings = tokenizer(v_tweets,truncation=True,padding = True)


max_len= 128
# Tokenize and encode the sentences
X_train_encoded = tokenizer.batch_encode_plus(X_train,
                                              padding=True,
                                              truncation=True,
                                              max_length = max_len,
                                              return_tensors='tf')

X_val_encoded = tokenizer.batch_encode_plus(X_val,
                                              padding=True,
                                              truncation=True,
                                              max_length = max_len,
                                              return_tensors='tf')

X_test_encoded = tokenizer.batch_encode_plus(X_test,
                                              padding=True,
                                              truncation=True,
                                              max_length = max_len,
                                              return_tensors='tf')

In [22]:
k = 0
print('Training Comments -->>',X_train[k])
print('\nInput Ids -->>\n',X_train_encoded['input_ids'][k])
print('\nDecoded Ids -->>\n',tokenizer.decode(X_train_encoded['input_ids'][k]))
print('\nAttention Mask -->>\n',X_train_encoded['attention_mask'][k])
print('\nLabels -->>',y_train[k])

Training Comments -->> took tio home monday get supply something life house excited really love dad

Input Ids -->>
 tf.Tensor(
[  101  2165 14841  2080  2188  6928  2131  4425  2242  2166  2160  7568
  2428  2293  3611   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(128,), dtype=int32)

Decoded Ids -->>
 [CLS] took tio home monday get supply somethi

In [23]:
# Intialize the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [24]:
# Compile the model with an appropriate optimizer, loss function, and metrics
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [25]:
import numpy as np
y_train = np.array(y_train).astype('float32')
y_val = np.array(y_val).astype('float32')

In [26]:
history = model.fit(
    x=[X_train_encoded['input_ids'], X_train_encoded['attention_mask']],
    y=y_train,
    validation_data=(
        [X_val_encoded['input_ids'],  X_val_encoded['attention_mask']], y_val),
    batch_size=32,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
y_test = np.array(y_test).astype('float32')
y_val = np.array(y_val).astype('float32')

In [34]:
k = 3
print('Training Comments -->>',X_test[k])
print('\nInput Ids -->>\n',X_test_encoded['input_ids'][k])
print('\nDecoded Ids -->>\n',tokenizer.decode(X_test_encoded['input_ids'][k]))
print('\nAttention Mask -->>\n',X_test_encoded['attention_mask'][k])
print('\nLabels -->>',y_test[k])

Training Comments -->> people complaining op playvalorant never played sg csgo

Input Ids -->>
 tf.Tensor(
[  101  2111 17949  6728  2377 10175 18842  2102  2196  2209 22214 20116
  3995   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(128,), dtype=int32)

Decoded Ids -->>
 [CLS] people complaining op playvalorant never played sg csgo [SEP

In [36]:
#Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(
    [X_val_encoded['input_ids'],  X_val_encoded['attention_mask']],
    y_val
)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

Test loss: 0.3734036982059479, Test accuracy: 0.8691860437393188


In [37]:
path = './tokenizer_and_model'
# Save tokenizer
tokenizer.save_pretrained(path +'/Tokenizer')

# Save model
model.save_pretrained(path +'/Model')

In [39]:
# Load tokenizer
bert_tokenizer = DistilBertTokenizerFast.from_pretrained(path +'/Tokenizer')

# Load model
bert_model = TFDistilBertForSequenceClassification.from_pretrained(path +'/Model')

Some layers from the model checkpoint at ./tokenizer_and_model/Model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./tokenizer_and_model/Model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
pred = bert_model.predict(
    [X_val_encoded['input_ids'],  X_val_encoded['attention_mask']])

# pred is of type TFSequenceClassifierOutput
logits = pred.logits

# Use argmax along the appropriate axis to get the predicted labels
pred_labels = tf.argmax(logits, axis=1)

# Convert the predicted labels to a NumPy array
pred_labels = pred_labels.numpy()

label = {
    3:'Positive',
    2:'Neutral' ,
    1:'Negative',
    0:'Irrelevant'}


# Map the predicted labels to their corresponding strings using the label dictionary
pred_labels = [label[i] for i in pred_labels]
Actual = [label[i] for i in y_val]

print('Predicted Label :', pred_labels[:10])
print('Actual Label    :', Actual[:10])

Predicted Label : ['Positive', 'Neutral', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Irrelevant', 'Positive', 'Positive']
Actual Label    : ['Positive', 'Neutral', 'Negative', 'Negative', 'Positive', 'Neutral', 'Positive', 'Irrelevant', 'Positive', 'Positive']


In [43]:
from sklearn.metrics import classification_report

print("Classification Report: \n", classification_report(Actual, pred_labels))

Classification Report: 
               precision    recall  f1-score   support

  Irrelevant       0.92      0.81      0.86      1927
    Negative       0.92      0.88      0.90      3422
     Neutral       0.88      0.82      0.85      2776
    Positive       0.79      0.93      0.86      3227

    accuracy                           0.87     11352
   macro avg       0.88      0.86      0.87     11352
weighted avg       0.87      0.87      0.87     11352



In [52]:
def Get_sentiment(Review, Tokenizer=bert_tokenizer, Model=bert_model):
    # Convert Review to a list if it's not already a list
    if not isinstance(Review, list):
        Review = [Review]

    Input_ids, Attention_mask = Tokenizer.batch_encode_plus(Review,
                                                                             padding=True,
                                                                             truncation=True,
                                                                             max_length=128,
                                                                             return_tensors='tf').values()
    prediction = Model.predict([Input_ids, Attention_mask])

    # Use argmax along the appropriate axis to get the predicted labels
    pred_labels = tf.argmax(prediction.logits, axis=1)

    # Convert the TensorFlow tensor to a NumPy array and then to a list to get the predicted sentiment labels
    pred_labels = [label[i] for i in pred_labels.numpy().tolist()]
    return pred_labels

In [58]:
tweet ='''that game is okay. But i don't think I will play it agian'''
Get_sentiment(tweet)



['Positive']