In [3]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download the 'wordnet' resource for NLTK
import nltk
nltk.download('wordnet') # This line downloads the necessary resource

# Define stopwords manually to avoid downloading NLTK resources
STOPWORDS = set(["the", "is", "in", "it", "and", "to", "a", "i", "this", "that", "of", "for", "on", "with", "as", "was", "but", "at", "by", "an"])

def preprocess_text(text):
    if not isinstance(text, str):  # Ensure input is a string
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Tokenization (splitting by spaces)
    tokens = text.split()
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in STOPWORDS]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return ' '.join(lemmatized_tokens)

# Load dataset
try:
    data = pd.read_csv('https://raw.githubusercontent.com/YashiGarg016/Mini-Project/refs/heads/main/YoutubeCommentsDataSet.csv')
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")

# Debugging: Print column names and first few rows
print("Column names:", data.columns)
print("First few rows:\n", data.head())

# Clean and standardize column names
data.columns = data.columns.str.strip().str.lower()  # Remove spaces & convert to lowercase

# Ensure 'comment' and 'sentiment' columns exist
if 'comment' not in data.columns or 'sentiment' not in data.columns:
    raise KeyError("'comment' or 'sentiment' column not found in dataset.")

# Drop missing comments
data = data.dropna(subset=['comment'])

# Preprocess the text data and extract labels
texts = [preprocess_text(text) for text in data['comment']]
labels = data['sentiment'].tolist()

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train a Maximum Entropy (Logistic Regression) model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

def predict_sentiment(text):
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)[0]
    return prediction

# Example user input
user_input = "I love this product, it's amazing!"
sentiment = predict_sentiment(user_input)
print(f"User input: '{user_input}'")
print(f"Predicted sentiment: {sentiment}")

user_input = "The product is terrible."
sentiment = predict_sentiment(user_input)
print(f"User input: '{user_input}'")
print(f"Predicted sentiment: {sentiment}")

[nltk_data] Downloading package wordnet to /root/nltk_data...


Dataset loaded successfully.
Column names: Index(['Comment', 'Sentiment'], dtype='object')
First few rows:
                                              Comment Sentiment
0  lets not forget that apple pay in 2014 require...   neutral
1  here in nz 50 of retailers don’t even have con...  negative
2  i will forever acknowledge this channel with t...  positive
3  whenever i go to a place that doesn’t take app...  negative
4  apple pay is so convenient secure and easy to ...  positive
              precision    recall  f1-score   support

    negative       0.64      0.28      0.39       441
     neutral       0.65      0.60      0.62       912
    positive       0.81      0.92      0.86      2320

    accuracy                           0.76      3673
   macro avg       0.70      0.60      0.63      3673
weighted avg       0.75      0.76      0.75      3673

User input: 'I love this product, it's amazing!'
Predicted sentiment: positive
User input: 'The product is terrible.'
Predicted senti