# In [1]:
# COVID-19 Tweet Sentiment Analysis

# disregard this, will delete later
# import sys
# print(sys.executable)
# print(sys.path[:2])   # just to see where it’s looking


# 1. Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


# 2. Load the training CSV, then print its dimensions and the count of rows
#    per sentiment class as a quick sanity check.
TRAIN_CSV_PATH = 'Corona_NLP_train.csv'
df = pd.read_csv(TRAIN_CSV_PATH, encoding='ISO-8859-1')
print(df.shape)
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

# 3. Preprocessing: remove URLs, punctuation, @'s to clean up the tweet
def clean_tweet(text):
    """Normalise a raw tweet string for vectorisation.

    The text is lowercased, then the following are deleted, in this order:
    URLs (``http...`` / ``www....``), ``@mentions``, and every remaining
    character that is not ``a-z``, ``0-9`` or whitespace. Leading and
    trailing whitespace is stripped; interior whitespace left behind by the
    removed tokens is kept unchanged.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    # Order matters: URLs and mentions must be removed while their
    # punctuation (":", "/", "@") is still present to anchor the match.
    removal_patterns = (
        r"http\S+|www\.\S+",
        r"@\w+",
        r"[^a-z0-9\s]",
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Cast to str first so every cell can be passed to clean_tweet, which relies
# on string methods; then clean each tweet element-wise.
raw_tweets = df['OriginalTweet'].astype(str)
df['clean'] = raw_tweets.map(clean_tweet)

# 4. Map string labels to integers ordered from most negative (0) to most
#    positive (4), to make it easier to categorize.
label_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}
df['label'] = df['Sentiment'].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Fail
# fast with a clear message rather than letting NaN labels reach the
# train/validation split and the classifier.
unmapped_sentiments = df.loc[df['label'].isna(), 'Sentiment'].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(
        f"Sentiment values missing from label_map: {list(unmapped_sentiments)!r}"
    )

# 5. Hold out 20% of the rows as a validation set. The split is stratified
#    on the label and seeded so it is reproducible.
X, y = df['clean'], df['label']
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 6. Build & train a pipeline to fit the training data:
#    - 'tfidf': TF-IDF over unigrams and bigrams, limited to 15000 features
#    - 'clf':   logistic regression with balanced class weights
#
# NOTE: `multi_class='multinomial'` is deliberately not passed. The parameter
# is deprecated in recent scikit-learn releases (it emits a FutureWarning and
# is scheduled for removal). With solver='lbfgs' and more than two classes,
# the default already selects the multinomial formulation, so the fitted
# model is unchanged.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=15000)),
    ('clf', LogisticRegression(solver='lbfgs', class_weight='balanced',
                               max_iter=1000, random_state=42)),
])
pipeline.fit(X_train, y_train)

# 7. Evaluate using the validation set
y_pred = pipeline.predict(X_val)

# Display names in label_map insertion order, which matches label ids 0..4
# (the sorted order used by the report and the confusion matrix).
class_names = list(label_map.keys())

report = classification_report(y_val, y_pred, target_names=class_names)
print("Classification Report:\n")
print(report)

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(6, 5))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
# Tilt both sets of tick labels so the long class names do not overlap.
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

# 8. Compute Mean Squared Error between true and predicted label ids.
#    This treats the 0-4 label ids as an equally spaced ordinal scale, so a
#    prediction two classes away is penalised four times more than one that
#    is off by a single class.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Validation MSE (ordinal labels): {mse:.4f}")

