In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load dataset
data = pd.read_csv('YoutubeCommentsDataSet.csv')

# Resolve the text and label columns: prefer the expected header names and
# fall back to column position (first = text, second = label) otherwise.
comment_col = 'comment' if 'comment' in data.columns else data.columns[0]
label_col = 'label' if 'label' in data.columns else data.columns[1]

# Drop rows with missing values in comment or label
data = data.dropna(subset=[comment_col, label_col])

# Show which label values are present before any filtering happens
print("Unique labels in dataset:", data[label_col].unique())

# Binary classification: positive -> 1, negative -> 0.
# Rows carrying any other label (e.g. 'neutral') are discarded.
label_mapping = {'positive': 1, 'negative': 0}

# Take an explicit copy of the filtered rows: assigning a column on the bare
# result of a boolean filter is the chained-assignment pattern that pandas
# flags with SettingWithCopyWarning.
data = data[data[label_col].isin(list(label_mapping))].copy()
data[label_col] = data[label_col].map(label_mapping)  # Apply mapping

# Extract features (comments) and labels as plain Python lists
X = data[comment_col].astype(str).tolist()  # Force every comment to str
y = data[label_col].tolist()  # Values produced by label_mapping (0 or 1)

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-Words features: the vocabulary is learned from the training split
# only, and the test split is encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training token counts
# (fit() returns the estimator itself, so the call can be chained).
classifier = MultinomialNB().fit(X_train_vec, y_train)

# Score the held-out comments
y_pred = classifier.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out split
print("Classification Report:\n", classification_report(y_test, y_pred))

# Function to predict sentiment for user input
def predict_sentiment(text):
    """Classify a single comment with the module-level model.

    The comment is encoded with the module-level ``vectorizer`` and passed
    to the module-level ``classifier``; both must already be fitted.

    Args:
        text: The comment to classify.

    Returns:
        "Positive" when the predicted label equals 1, otherwise "Negative".
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([text])
    predicted_label = classifier.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

# User input example
# Strip surrounding whitespace so that a blank entry can be detected.
user_input = input("Enter a comment: ").strip()
if user_input:
    sentiment = predict_sentiment(user_input)
    print(f"The sentiment of the comment is: {sentiment}")
else:
    # An empty comment contains no words for the vectorizer to count, so any
    # label returned for it would say nothing about the text; skip prediction.
    sentiment = None
    print("No comment entered; nothing to classify.")


Unique labels in dataset: ['neutral' 'negative' 'positive']
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.42      0.55       461
           1       0.89      0.98      0.94      2287

    accuracy                           0.89      2748
   macro avg       0.85      0.70      0.74      2748
weighted avg       0.88      0.89      0.87      2748



Enter a comment:  i hate this video


The sentiment of the comment is: Positive
