In [56]:
import kagglehub
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk

In [57]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ashutosh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ashutosh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashutosh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ashutosh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [58]:
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")
print("Path to dataset files:", path)

Path to dataset files: /home/ashutosh/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9


In [59]:
train_path = os.path.join(path, 'train.csv')
train_data = pd.read_csv(train_path, encoding='ISO-8859-1')

In [60]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [61]:
print("Preprocessing texts...")
train_data['processed_text'] = train_data['text'].apply(preprocess_text)

Preprocessing texts...


In [62]:
train_data['processed_text'] = train_data['text'].apply(preprocess_text)

In [63]:
print("\nFirst few rows of preprocessed text:")
print(train_data[['text', 'processed_text']].head())


First few rows of preprocessed text:
                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on t...   

                           processed_text  
0                      id responded going  
1                 sooo sad miss san diego  
2                            bos bullying  
3                   interview leave alone  
4  son couldnt put release already bought  


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [65]:
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_data['processed_text'])

Creating TF-IDF features...


In [66]:
unique_words = vectorizer.get_feature_names_out()
print(f"\nNumber of unique words (features): {len(unique_words)}")


Number of unique words (features): 5000


In [67]:
y = train_data['sentiment']

In [68]:
print("\nSplitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



Splitting data into train and test sets...


In [69]:
print("Training the model...")
model = MultinomialNB()
model.fit(X_train, y_train)

Training the model...


In [70]:
print("Making predictions...")
y_pred = model.predict(X_test)

Making predictions...


In [71]:
print("\nModel Evaluation:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Evaluation:
Accuracy: 0.64

Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.50      0.59      1562
     neutral       0.56      0.76      0.64      2230
    positive       0.72      0.60      0.66      1705

    accuracy                           0.64      5497
   macro avg       0.67      0.62      0.63      5497
weighted avg       0.66      0.64      0.63      5497



In [72]:
def predict_sentiment(text):
    processed = preprocess_text(text)
    features = vectorizer.transform([processed])
    return model.predict(features)[0]


In [73]:
sample_text = "I really love this product!"
prediction = predict_sentiment(sample_text)
print(f"\nSample prediction for '{sample_text}': {prediction}")


Sample prediction for 'I really love this product!': positive
