In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from wordcloud import WordCloud


In [None]:
# Load dataset
file_path = "/content/Corona_NLP_train.csv"
try:
    df = pd.read_csv(file_path)
except UnicodeDecodeError:
    # The file is not valid UTF-8. Latin-1 assigns a character to every byte
    # value, so this second attempt cannot fail on decoding.
    df = pd.read_csv(file_path, encoding="latin-1")


In [None]:
# Text preprocessing
def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lowercase words.

    Steps, in order: lowercase; drop URLs (anything starting with ``http``);
    drop @mentions; drop #hashtags together with their word; drop every
    character that is not ``a-z`` or whitespace; drop words of one or two
    letters.

    Args:
        text: The raw tweet. ``None`` and float NaN are treated as a missing
            tweet; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned = str(text).lower()
    cleaned = re.sub(r'http\S+', '', cleaned)  # Remove URLs
    cleaned = re.sub(r'@\w+', '', cleaned)  # Remove mentions
    cleaned = re.sub(r'#\w+', '', cleaned)  # Remove hashtags
    cleaned = re.sub(r'[^a-z\s]', '', cleaned)  # Remove punctuation and numbers
    # Remove short words (one or two letters)
    return " ".join(word for word in cleaned.split() if len(word) > 2)

# Replace missing tweets with "" before the string cast: astype(str) alone
# would turn NaN into the literal word "nan", which survives the cleaning.
df["CleanedTweet"] = df["OriginalTweet"].fillna("").astype(str).apply(preprocess_text)


In [None]:
# Word Cloud Visualization
# Join every cleaned tweet into one string, which is the input WordCloud expects.
all_text = " ".join(df["CleanedTweet"])
# Fixed seed so the word placement and colours are the same on every run,
# matching the random_state=42 used by the modelling cells.
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      random_state=42).generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Tweets")
plt.show()


In [None]:
# Bag of Words (BoW) Visualization
# Restrict the vocabulary to 20 terms and count them across the cleaned tweets.
vectorizer = CountVectorizer(max_features=20)
X_bow = vectorizer.fit_transform(df["CleanedTweet"])
words = vectorizer.get_feature_names_out()
# Summing each column gives the corpus-wide count of that term.
word_counts = X_bow.toarray().sum(axis=0)
word_freq = {word: count for word, count in zip(words, word_counts)}

# Bar chart of the counts, one bar per vocabulary term.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(words, word_counts, color='skyblue')
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Top 20 Words in Bag of Words Representation")
plt.show()


In [None]:
# Convert labels to numerical values
# Ordinal encoding: the sign gives the polarity, the magnitude the intensity.
sentiment_mapping = {
    "Positive": 1,
    "Negative": -1,
    "Neutral": 0,
    "Extremely Positive": 2,
    "Extremely Negative": -2
}
y = df["Sentiment"].map(sentiment_mapping)

# .map() yields NaN for any label that is not a key of the mapping. Stop here
# and name the offending labels instead of passing NaN targets downstream.
unmapped = df.loc[y.isna(), "Sentiment"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped sentiment labels: {unmapped.tolist()}")


In [None]:
# TF-IDF vectorization
# Turn each cleaned tweet into a TF-IDF weighted vector over at most 2000 terms.
# NOTE(review): the vectorizer is fitted on every row of df, including rows that
# the later train/test split assigns to the test set - confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(
    max_features=2000,
)
tweets_clean = df["CleanedTweet"]
X_tfidf = tfidf_vectorizer.fit_transform(tweets_clean)


In [None]:
# Dimensionality reduction using Truncated SVD
# Project the TF-IDF matrix onto 300 components, with a fixed seed for repeatable results.
svd = TruncatedSVD(
    random_state=42,
    n_components=300,
)
X_reduced = svd.fit_transform(X_tfidf)


In [None]:
# Train-test split
# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both parts, and the seed fixes the shuffle.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, **split_options)


In [None]:
# Train KNN classifier
# Five nearest neighbours under cosine distance; fit() returns the fitted estimator.
knn = KNeighborsClassifier(metric='cosine', n_neighbors=5).fit(X_train, y_train)

# Predictions
# Predict a label for every held-out row.
y_pred = knn.predict(X_test)


In [None]:
# Evaluation
# Score the held-out predictions: overall accuracy and the per-class report.
truth_and_prediction = (y_test, y_pred)
accuracy = accuracy_score(*truth_and_prediction)
report = classification_report(*truth_and_prediction)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)


In [None]:
# Confusion matrix heatmap
# confusion_matrix orders its rows and columns by the `labels` argument, so the
# tick names must be listed in that same order. sentiment_mapping.keys() follows
# the dict's insertion order (Positive, Negative, Neutral, ...), which does not
# match the numeric order of the encoded classes (-2 .. 2).
class_ids = sorted(sentiment_mapping.values())
id_to_name = {class_id: name for name, class_id in sentiment_mapping.items()}
class_names = [id_to_name[class_id] for class_id in class_ids]

# Passing labels explicitly also keeps the matrix 5x5 when a class is absent
# from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
