In [34]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import download

# Download stopwords if not already downloaded
download('punkt')
download('stopwords')

# Load the dataset
url = 'https://drive.google.com/uc?id=1HWczIICsMpaL8EJyu48ZvRFcXx3_pcnb'
data = pd.read_csv(url)

# Display first few rows of the dataset
data.head()


# Check the columns and first few rows of the dataset
#print(data.columns)  # List all columns
#print(data.head())    # Display the first few rows


# Sample data cleaning function
def preprocess_text(Comment):
    # Convert text to lowercase
    text = Comment.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', Comment)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply preprocessing to the text column
data['cleaned_text'] = data['Comment'].apply(preprocess_text)

# Split data into features (X) and target labels (y)
X = data['cleaned_text']
y = data['Emotion']  # Assuming the 'emotion' column holds the emotion labels


# CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
X_count = count_vectorizer.fit_transform(X)

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)



from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Support Vector Machine Model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict using Naive Bayes
y_pred_nb = nb_model.predict(X_test)

# Predict using SVM
y_pred_svm = svm_model.predict(X_test)


# Evaluate Naive Bayes
accuracy_nb = accuracy_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')
print(f'Naive Bayes Accuracy: {accuracy_nb:.4f}')
print(f'Naive Bayes F1-Score: {f1_nb:.4f}')
print('Naive Bayes Classification Report:')
print(classification_report(y_test, y_pred_nb))

# Evaluate SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
print(f'SVM Accuracy: {accuracy_svm:.4f}')
print(f'SVM F1-Score: {f1_svm:.4f}')
print('SVM Classification Report:')
print(classification_report(y_test, y_pred_svm))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ancu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ancu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Naive Bayes Accuracy: 0.9108
Naive Bayes F1-Score: 0.9106
Naive Bayes Classification Report:
              precision    recall  f1-score   support

       anger       0.88      0.95      0.91       392
        fear       0.91      0.92      0.92       416
         joy       0.94      0.86      0.90       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188

SVM Accuracy: 0.9444
SVM F1-Score: 0.9444
SVM Classification Report:
              precision    recall  f1-score   support

       anger       0.93      0.95      0.94       392
        fear       0.97      0.92      0.94       416
         joy       0.94      0.96      0.95       380

    accuracy                           0.94      1188
   macro avg       0.94      0.95      0.94      1188
weighted avg       0.95      0.94      0.94      1188

