In [6]:
# disregard this, will delete later
# import sys
# print(sys.executable)
# print(sys.path[:2])   # just to see where itâ€™s looking


# COVID-19 Tweet Sentiment Analysis
# 1. Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:

# 2. Load data and inspect
df = pd.read_csv('Corona_NLP_train.csv', encoding='ISO-8859-1')
print(df.shape)
print(df['Sentiment'].value_counts())

(41157, 6)
Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64


In [8]:
# 3. Preprocessing: remove URLs, punctuation, @'s to clean up the tweet
def clean_tweet(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", "", text)     
    text = re.sub(r"@\w+", "", text)                 
    text = re.sub(r"[^a-z0-9\s]", "", text)          
    return text.strip()

df['clean'] = df['OriginalTweet'].astype(str).apply(clean_tweet)

In [9]:

# 4. Map string labels to numerics, to make it easier to categorize
label_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}
df['label'] = df['Sentiment'].map(label_map)

In [10]:
# 5. Validation testing
X = df['clean']
y = df['label']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# 6. Build & train a pipeline to fit the training data
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=15000)),
    ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs',
                               class_weight='balanced', max_iter=1000, random_state=42))
])
pipeline.fit(X_train, y_train)  


In [None]:
# 6.5 creating seperate pipelines for Logisitc Regression and Naive Bayes
pipelines = {
    'Logistic Regression': Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=15000)),
        ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs',
                                   class_weight='balanced', max_iter=1000, random_state=42))
    ]),
    'Naive Bayes': Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=15000)),
        ('clf', MultinomialNB())
    ])
}

# 7.5 Evauluate both pipelines
for name, pipeline in pipelines.items():
    print(f"\n=== {name} ===")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=label_map.keys()))

    cm = confusion_matrix(y_val, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_map.keys(), yticklabels=label_map.keys())
    plt.title(f'Confusion Matrix: {name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()

    mse = mean_squared_error(y_val, y_pred)
    print(f"Validation MSE: {mse:.4f}")

NameError: name 'MultinomialNB' is not defined

In [None]:
# 7. Evaluate using the validation set
y_pred = pipeline.predict(X_val)
print("Classification Report:\n")
print(classification_report(y_val, y_pred, target_names=label_map.keys()))

In [None]:
# Confusion matrix
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_map.keys(), yticklabels=label_map.keys())
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 8. Compute Mean Squared Error
mse = mean_squared_error(y_val, y_pred)
print(f"Validation MSE (ordinal labels): {mse:.4f}")