In [9]:
#using the latest version of Scikit
!pip install -U scikit-learn==1.7.0




In [2]:
#importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Loading the balanced dataset of original reviews
df_original = pd.read_csv('/content/balanced.csv')

# Loading the generated negation data from CSV
df_generated = pd.read_csv('/content/balanced_positive_negative_phrases.csv')

# Showing the column names of both frames (they should be 'Review' and 'Sentiment')
print(df_original.columns)
print(df_generated.columns)

# Enforcing that both frames share the same columns: pd.concat aligns on
# column names, so a mismatch would silently produce NaN-filled columns
# in the merged frame instead of raising an error.
if set(df_original.columns) != set(df_generated.columns):
    raise ValueError(
        "Column mismatch between datasets: "
        f"{list(df_original.columns)} vs {list(df_generated.columns)}"
    )

# Merging the two datasets (row index is rebuilt from 0)
df_combined = pd.concat([df_original, df_generated], ignore_index=True)

# Shuffling the data to mix generated and original examples
# (fixed random_state keeps the row order repeatable)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Saving to a new CSV file
df_combined.to_csv('/content/merged_dataset.csv', index=False)

print("Merged dataset shape:", df_combined.shape)



Index(['Review', 'Sentiment'], dtype='object')
Index(['Review', 'Sentiment'], dtype='object')
Merged dataset shape: (174074, 2)


In [4]:
# Number of generated phrases per sentiment label
sentiment_counts = df_generated['Sentiment'].value_counts()
print(sentiment_counts)

Sentiment
Negative    5000
Positive    5000
Name: count, dtype: int64


In [5]:
# Translation table that deletes every ASCII punctuation character except
# the underscore; built once here instead of on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation.replace('_', ''))

# Lazily-built stop-word set; stays None until preprocess() is first called.
_STOP_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop words without "not", building the set once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english')) - {'not'}
    return _STOP_WORDS


def preprocess(text):
    """Normalise one review for vectorisation.

    Lower-cases the text, strips punctuation (underscores are preserved),
    and removes English stop words while keeping "not".

    Args:
        text: The raw review. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as an empty review; any
            other non-string value is converted with ``str()``.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing
        is left or the input was missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Removing punctuation (preserve underscores)
    text = text.translate(_PUNCTUATION_TABLE)

    # Removing stopwords (keep "not")
    stop_words = _get_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run every raw review through preprocess() and keep the result in its own column
df_combined['clean_review'] = df_combined['Review'].apply(preprocess)

# ------------------ MODEL TRAINING ------------------ #
# Cleaned text is the model input; the Sentiment column is the target
X, y = df_combined['clean_review'], df_combined['Sentiment']

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.95)
# The vectorizer is fitted on the training split only; the test split is just transformed
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training features
model = MultinomialNB().fit(X_train_vec, y_train)

# ------------------ EVALUATION ------------------ #
# Score the held-out split
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# ------------------ SAMPLE TESTS ------------------ #
# Hand-picked phrases used as a quick sanity check of the fitted model
samples = [
    "not good",
    "very helpful",
    "not working",
    "Waaay too small. Will use for future children!",
    "cocoanut water i must say i am not too thrilled.",
]

# Clean, vectorize and classify all samples in one pass, then report each one
cleaned_samples = [preprocess(sample) for sample in samples]
sample_predictions = model.predict(vectorizer.transform(cleaned_samples))

for sample, label in zip(samples, sample_predictions):
    print(f"'{sample}' → {label}")


Accuracy: 0.9176504380295849

Classification Report:
               precision    recall  f1-score   support

    Negative       0.89      0.95      0.92     17255
    Positive       0.94      0.89      0.92     17560

    accuracy                           0.92     34815
   macro avg       0.92      0.92      0.92     34815
weighted avg       0.92      0.92      0.92     34815

'not good' → Negative
'very helpful' → Positive
'not working' → Negative
'Waaay too small. Will use for future children!' → Negative
'cocoanut water i must say i am not too thrilled.' → Negative


In [6]:
import pickle

# Saving the trained model and the fitted vectorizer, each to its own pickle file
for artifact, pickle_path in (
    (model, 'sentiment_model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(artifact, handle)


In [7]:
# Downloading both pickle files through the google.colab files helper
from google.colab import files

for pickle_path in ('sentiment_model.pkl', 'vectorizer.pkl'):
    files.download(pickle_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# Reporting which scikit-learn version is loaded in this kernel
import sklearn

installed_version = sklearn.__version__
print("scikit-learn version:", installed_version)


scikit-learn version: 1.7.0
