In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
from plotly.subplots import make_subplots

from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments,Trainer

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data overview

In [2]:
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='ISO-8859-1')
display(train.info(),
       train.describe(),
       train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


None

Unnamed: 0,UserName,ScreenName
count,41157.0,41157.0
mean,24377.0,69329.0
std,11881.146851,11881.146851
min,3799.0,48751.0
25%,14088.0,59040.0
50%,24377.0,69329.0
75%,34666.0,79618.0
max,44955.0,89907.0


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
fig = px.pie(names = train['Sentiment'])
fig.show()

In [5]:
coder = LabelEncoder()
train['Sentiment'] = coder.fit_transform(train['Sentiment'])
train['Sentiment'].value_counts()

Sentiment
4    11422
2     9917
3     7713
1     6624
0     5481
Name: count, dtype: int64

# Tockenization and normalization

In [None]:
train_ds = Dataset.from_pandas(train)
train_ds

In [None]:
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x): 
    return tokz(x['OriginalTweet'])

In [None]:
tok_train = train_ds.map(tok_func, batched=True)
tok_train

In [None]:
row = tok_train[0]
row['OriginalTweet'], row['input_ids']

In [None]:
tok_train = tok_train.rename_columns({'Sentiment':'labels'})

# Training

In [None]:
dds = tok_train.train_test_split(0.25, seed=42)
dds

In [None]:
batch_size = 32
epochs = 4
lr = 8e-5

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
from transformers import DebertaV2TokenizerFast, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
# Load pre-trained model and tokenizer
model = DebertaV2ForSequenceClassification.from_pretrained(model_nm, num_labels=5)
tokenizer = DebertaV2TokenizerFast.from_pretrained(model_nm)

# Fine-tune the model using your training data (replace train_dataset with your actual training dataset)
args = TrainingArguments(
    output_dir='./output',
    logging_dir='./logs',
    
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    
    num_train_epochs=epochs,
    evaluation_strategy="epoch", #Specifies after what to evaluate the model during training:"epoch", "steps","no" 
    
    learning_rate=lr,
    lr_scheduler_type='cosine', #a cosine annealing scheduler: The learning rate starts high and gradually decreases in a cosine pattern, 
                                #helping the model converge more smoothly.
                                #'linear', 'constant', or custom learning rate schedulers.
    warmup_ratio=0.1, #Warmup is a technique where the learning rate starts very low and gradually increases for a certain number of steps. 
                      #It helps stabilize training at the beginning.
                      #The warmup ratio is the fraction of total training steps during which the warmup is applied.
    weight_decay=0.01, # regularization technique that penalizes large weights in the model.
                       #This parameter specifies the strength of the weight decay. 
                       #A higher value means stronger regularization.
    
    logging_steps=500,#Log training information every logging_steps steps
    do_train=True,
    fp16=False,  #Whether to use mixed-precision training (16-bit floating-point) to reduce memory usage.
    report_to='none'
)

trainer = Trainer(model, 
                  args, 
                  train_dataset=dds['train'], 
                  eval_dataset=dds['test'],
                  tokenizer=tokz, 
                  compute_metrics=compute_metrics)

trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')


# Model on test data

In [None]:
test_true = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding='ISO-8859-1')
test_true['Sentiment']=coder.fit_transform(test_true['Sentiment'])
display(test_true.info(),
       test_true.describe(),
       test_true.head())

In [None]:
test_true_ds = Dataset.from_pandas(test_true).map(tok_func, batched=True)

In [None]:
preds = trainer.predict(test_true_ds).predictions.astype(int)

In [None]:
from sklearn.metrics import classification_report

# Assuming you have true labels in test_true_ds
true_labels = test_true['Sentiment']

# Convert predicted probabilities to labels
pred_labels = preds.argmax(axis=1)

# Generate a classification report
class_report = classification_report(true_labels, pred_labels)

# Print or use the classification report as needed
print(class_report)

# MODULES FOR Sentiment Analysis

In [6]:
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='ISO-8859-1')
train.Sentiment.value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [7]:
positive = ['Positive', 'Extremely Positive']
negative = ['Negative', 'Extremely Negative']

def map_sentiment(x):
    if x in positive:
        return 'pos'
    elif x in negative:
        return 'neg'
    else:
        return 'neu'
    
train['Sentiment'] = train['Sentiment'].apply(map_sentiment)
train['Sentiment'].value_counts()

Sentiment
pos    18046
neg    15398
neu     7713
Name: count, dtype: int64

## NLTK (Natural Language Toolkit):

In [24]:
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report

analyzer = SentimentIntensityAnalyzer()
true_sent = train['Sentiment']

# Convert compound scores to categorical labels
predicted_sentiments = []
for tweet in train['OriginalTweet']:
    sentiment_score = analyzer.polarity_scores(tweet)
    sentiment_score.pop('compound', None)
    predicted_sentiment = max(sentiment_score, key=sentiment_score.get)
    predicted_sentiments.append(predicted_sentiment)

# Compute classification report
class_report = classification_report(true_sent, predicted_sentiments)
print(class_report)

              precision    recall  f1-score   support

         neg       0.99      0.00      0.01     15398
         neu       0.19      1.00      0.32      7713
         pos       0.98      0.00      0.01     18046

    accuracy                           0.19     41157
   macro avg       0.72      0.34      0.11     41157
weighted avg       0.83      0.19      0.07     41157



## TextBlob

In [8]:
from textblob import TextBlob

text = "I love Python!"
blob = TextBlob(text)
sentiment = blob.sentiment
print(sentiment)

Sentiment(polarity=0.625, subjectivity=0.6)


## Transformers (Hugging Face)

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import classification_report

# Replace 'distilbert-base-uncased-finetuned-sst-2-english' with your desired model
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create sentiment analysis pipeline with the specified model
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


true_sent = train['Sentiment']
predicted_sentiments = []

for tweet in train['OriginalTweet']:
    result = sentiment_analyzer(tweet)
    if result[0]['label']=='POSITIVE':
        sent = 'pos'
    elif result[0]['label']=='NEGATIVE':
        sent='neg'
    else:
        sent='neu'
    predicted_sentiments.append(sent)

# Compute classification report
class_report = classification_report(true_sent, predicted_sentiments)
print(class_report)

## Multinomial Naive Bayes

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load your dataset
texts = train['OriginalTweet']
labels = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a classifier
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

# Make predictions
predictions = classifier.predict(X_test_vec)

# Evaluate accuracy
class_report = classification_report(y_test, predictions)
print(class_report)


              precision    recall  f1-score   support

         neg       0.69      0.75      0.72      3830
         neu       0.77      0.18      0.30      1919
         pos       0.66      0.83      0.74      4541

    accuracy                           0.68     10290
   macro avg       0.71      0.59      0.58     10290
weighted avg       0.69      0.68      0.65     10290



## Support Vector Machines (SVM)

**Objective:** The primary objective of SVM is to find a hyperplane in an N-dimensional space (where N is the number of features) that distinctly classifies the data points.

**Hyperplane:** In a two-dimensional space, a hyperplane is a simple line that separates the data into two classes. In three-dimensional space, it becomes a plane, and in higher dimensions, it's referred to as a hyperplane.

**Support Vectors:** These are the data points that lie closest to the decision boundary (hyperplane). They are crucial in defining the position and orientation of the hyperplane. SVM aims to maximize the margin, which is the distance between the support vectors and the hyperplane.

**Kernel Trick:** SVM can efficiently handle non-linear decision boundaries by using the kernel trick. The kernel function transforms the input features into a higher-dimensional space, making it possible to find a hyperplane in that space. Common kernel functions include linear, polynomial, radial basis function (RBF), and sigmoid.

**C Parameter:** SVM introduces a regularization parameter 'C' that influences the trade-off between having a smooth decision boundary and classifying the training points correctly. A small 'C' encourages a larger margin but may misclassify some training points, while a large 'C' aims for correct classification at the cost of a narrower margin.

**SVM for Regression:** In addition to classification, SVM can be used for regression tasks. It's called Support Vector Regression (SVR), and instead of classifying points, it predicts a continuous output.

Pros:

Effective in high-dimensional spaces.
Memory-efficient, as it uses only a subset of training points (support vectors) for decision making.
Versatile due to the availability of different kernel functions.

Cons:

Sensitive to the choice of kernel and parameters.
Computationally intensive, especially for large datasets.

In [12]:
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = train['OriginalTweet']
y = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train SVM model.
clf = svm.SVC(kernel='linear')
clf.fit(X_train_tfidf, y_train)

predictions = clf.predict(X_test_tfidf)

class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

         neg       0.80      0.82      0.81      3062
         neu       0.76      0.65      0.70      1553
         pos       0.83      0.86      0.84      3617

    accuracy                           0.81      8232
   macro avg       0.80      0.78      0.78      8232
weighted avg       0.80      0.81      0.80      8232

