<a href="https://www.kaggle.com/code/bcodep06/covid-19-text-classification?scriptVersionId=260837692" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# COVID-19 Tweet Sentiment Classification

This notebook performs text classification on COVID-19 related tweets. The steps include:
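1. Data Loading
2. Preprocessing (cleaning, mentions, hashtags, compound words, lemmatization)
3. Train/Test Split
4. Label Encoding
5. TF-IDF Vectorization
6. Oversampling, Dimensionality Reduction, and Model Training (Logistic Regression, Naive Bayes, Linear SVC)
7. Hyperparameter Search for Logistic Regression
8. Final Model Training and Evaluation on the Test Set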

In [None]:
# Upgrade scikit-learn and install imbalanced-learn (imported later in this notebook as `imblearn`)
pip install -U scikit-learn imbalanced-learn

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

## 1. Load Dataset

In [None]:
# Load the training CSV, decoding it as Latin-1
df = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='latin1')

# Keep only the tweet text and its sentiment label.
# Columns are selected by name because the rest of the notebook addresses them by name,
# and .copy() makes text_data an independent frame so that adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
text_data = df[['OriginalTweet', 'Sentiment']].copy()
text_data.head(8)

In [None]:
# Count tweets per sentiment label to see how imbalanced the classes are
sentiment_counts = text_data['Sentiment'].value_counts()
print(sentiment_counts)

## 2. Text Preprocessing

In [None]:
# Initialize the WordNet lemmatizer once; text_preprocessing reuses this instance for every token
# NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus at runtime — confirm it is available in this environment
lemmatizer = WordNetLemmatizer()

# List of known keywords to help split compound words in hashtags or concatenated words
KEYWORDS = ['coronavirus', 'vaccine', 'lockdown', 'outbreak', 'airline', 'webcheckin']

# Trailing letters that merely inflect a keyword (plural forms) rather than start a second word
_INFLECTION_SUFFIXES = frozenset({'s', 'es'})


def _separate_keyword(match):
    """Return the matched text with a space inserted after the keyword, unless the tail is only a plural suffix."""
    keyword, remainder = match.group(1), match.group(2)
    if remainder in _INFLECTION_SUFFIXES:
        # e.g. 'vaccines' must stay intact instead of becoming 'vaccine s'
        return match.group(0)
    return f'{keyword} {remainder}'


# Function to split compound words using known keywords
def split_compound_words(text):
    """Insert a space between a known keyword and the lowercase letters glued to it.

    For every entry in KEYWORDS, an occurrence immediately followed by more
    lowercase letters (e.g. 'coronavirusoutbreak') is rewritten as two words
    ('coronavirus outbreak'). Plain plurals such as 'vaccines' or
    'coronaviruses' are left untouched.

    Args:
        text: Lowercased text to scan.

    Returns:
        The text with known compound words separated by a single space.
    """
    for kw in KEYWORDS:
        # re.escape keeps the keyword literal even if a future entry contains regex metacharacters
        text = re.sub(f'({re.escape(kw)})([a-z]+)', _separate_keyword, text)
    return text

def text_preprocessing(text):
    """Normalize a raw tweet into a cleaned, lemmatized string.

    Steps, in order: lowercase, strip URLs, drop "word:" prefixes, unwrap
    hashtags, split known compound words, replace mentions and numbers with
    placeholders, drop parenthesized text and selected punctuation, collapse
    whitespace, then tokenize and lemmatize each token.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned tweet as a single string of space-joined lemmatized tokens.
    """
    # Convert all text to lowercase to standardize words
    text = text.lower()

    # Remove URLs and links first. This has to happen before the "word:" rule below:
    # that rule would strip the 'https:' scheme and leave '//host/path' fragments
    # that no later step cleans up (only '//t.co/...' leftovers were being caught).
    text = re.sub(r'https?://\S+|www\.\S+|//t\.co/\S+', '', text)
    # Remove words followed by a colon
    text = re.sub(r'\w+:', '', text)
    # Remove hashtags symbol (#) but keep the word following it
    text = re.sub(r'#(\w+)', r'\1', text)
    # Split known compound words in hashtags or concatenated words (like 'coronavirusoutbreak')
    text = split_compound_words(text)
    # Replace all mentions (@username) with a placeholder 'MENTION'
    text = re.sub(r'@\w+', 'MENTION', text)
    # Remove any text inside parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Replace all numbers with a placeholder '<NUM>'
    text = re.sub(r'\d+', ' <NUM> ', text)
    # Remove punctuation characters to simplify text
    text = re.sub(r'[.,!?;:&$|=]', ' ', text)
    # Replace multiple spaces with a single space and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize, lemmatize each token, and stitch the tokens back into one string
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Clean every original tweet and store the result next to it in a new column
cleaned_tweets = text_data['OriginalTweet'].apply(text_preprocessing)
text_data['CleanedTweet'] = cleaned_tweets

## 3. Train/Test Split

In [None]:
# Features are the cleaned tweets; targets are the sentiment labels
X, y = text_data['CleanedTweet'], text_data['Sentiment']

# Hold out 20% for testing, stratified on the label, with a fixed seed for reproducibility
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 4. Label Encoding

In [None]:
# Learn the label-to-integer mapping from the training labels only, then apply it to both splits
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

## 5. TF-IDF Vectorization

In [None]:
# Vectorizer settings: English stop words removed, unigrams and bigrams,
# vocabulary capped at 3000 terms, terms kept only if they appear in at least
# 3 documents and in at most 85% of documents
tfidf_settings = {
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'max_features': 3000,
    'min_df': 3,
    'max_df': 0.85,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary and IDF weights on the training tweets only, then reuse them for the test tweets
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

### Optional: Inspect features of a sample tweet

In [None]:
# Show the TF-IDF terms and weights that are non-zero for one training tweet (row index 3)
feature_names = tfidf.get_feature_names_out()
first_row = X_train_vec[3].toarray()[0]
non_zero_indices = first_row.nonzero()[0]

for term, weight in zip(feature_names[non_zero_indices], first_row[non_zero_indices]):
    print(term, weight)

## 6. Oversample, Reduce Dimensionality, and Train Models

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the TF-IDF training matrix and its encoded labels (seeded for reproducibility)
ros = RandomOverSampler(random_state=42)
resampled_training_set = ros.fit_resample(X_train_vec, y_train_enc)
X_train_res, y_train_res = resampled_training_set

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF features onto 400 SVD components.
# The projection is fitted on the oversampled training matrix and then
# applied unchanged to the test matrix.
svd = TruncatedSVD(
    n_components=400,
    random_state=42,
)
X_train_vec_svd = svd.fit_transform(X_train_res)
X_test_vec_svd = svd.transform(X_test_vec)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Candidate classifiers, keyed by the label used in the printed reports
models = {
    'LogisticRegression': LogisticRegression(C = 0.5,max_iter=1000, class_weight='balanced',solver='lbfgs',random_state=42),
    'Naive Bayes': MultinomialNB(),
    'Linear SVC': LinearSVC(C = 0.5,class_weight='balanced', random_state=42)
}

# Two (train features, train labels, test features) bundles:
# - raw TF-IDF with the original labels, used by Naive Bayes
#   (presumably because SVD output can be negative, which MultinomialNB does not accept)
# - oversampled + SVD-reduced features, used by every other model
tfidf_bundle = (X_train_vec, y_train_enc, X_test_vec)
svd_bundle = (X_train_vec_svd, y_train_res, X_test_vec_svd)

for label, model in models.items():
    print(f"\nTraining Model With {label}....")

    fit_features, fit_labels, eval_features = (
        tfidf_bundle if label == "Naive Bayes" else svd_bundle
    )
    model.fit(fit_features, fit_labels)
    y_pred = model.predict(eval_features)

    # Report per-class metrics and balanced accuracy on the held-out test labels
    print(f"\nResults for {label}:")
    print(classification_report(y_test_enc, y_pred, target_names=le.classes_))
    print(f"Balanced Accuracy ({label}):", balanced_accuracy_score(y_test_enc, y_pred))



## 7. Tune Logistic Regression with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of C to compare
param_grid = {'C': [0.5, 1, 2, 5]}

# 3-fold cross-validated search over C on the raw TF-IDF training features,
# scored by balanced accuracy and run on all available cores
base_estimator = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
grid = GridSearchCV(
    base_estimator,
    param_grid,
    cv=3,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid.fit(X_train_vec, y_train_enc)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)


## 8. Retrain final model with C=5

In [None]:
# Train the final logistic regression (C=5) on the raw TF-IDF training features
final_model = LogisticRegression(
    C=5,
    max_iter=2000,
    class_weight='balanced',
    random_state=42,
).fit(X_train_vec, y_train_enc)

# Score it on the held-out test set
y_pred = final_model.predict(X_test_vec)
final_report = classification_report(y_test_enc, y_pred, target_names=le.classes_)
final_balanced_accuracy = balanced_accuracy_score(y_test_enc, y_pred)

print(final_report)
print("Balanced Accuracy:", final_balanced_accuracy)
