In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import pickle

# Load the excellent binary sentiment dataset
print("Loading sentiment_3class.csv...")
df = pd.read_csv("sentiment_3class.csv", nrows=200000)  # Use 200K samples for faster training

print(f"Total samples: {len(df):,}")
print("\nLabel distribution:")
print(df['label'].value_counts())

Loading sentiment_3class.csv...
Total samples: 200,000

Label distribution:
label
positive    101166
negative     98834
Name: count, dtype: int64
Total samples: 200,000

Label distribution:
label
positive    101166
negative     98834
Name: count, dtype: int64


In [2]:
# Map labels to numeric IDs
label2id = {"negative": 0, "positive": 1}
id2label = {0: "negative", 1: "positive"}

df["label_id"] = df["label"].map(label2id)

# Split features and labels
X = df["text"]
y = df["label_id"]

print("Labels mapped successfully!")
print(f"Unique label IDs: {sorted(y.unique())}")

Labels mapped successfully!
Unique label IDs: [np.int64(0), np.int64(1)]


In [3]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")

Training samples: 160,000
Test samples: 40,000


In [4]:
# Convert text to TF-IDF features
print("Vectorizing text with TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_):,}")

Vectorizing text with TF-IDF...
TF-IDF shape: (160000, 5000)
Vocabulary size: 5,000


In [5]:
# Train Binary XGBoost model
print("Training Binary Sentiment XGBoost model...")

model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    early_stopping_rounds=10,
    verbosity=1
)

model.fit(
    X_train_tfidf, y_train,
    eval_set=[(X_test_tfidf, y_test)],
    verbose=True
)

print("\nâœ… Training completed!")

Training Binary Sentiment XGBoost model...
[0]	validation_0-logloss:0.67285
[0]	validation_0-logloss:0.67285
[1]	validation_0-logloss:0.65708
[1]	validation_0-logloss:0.65708
[2]	validation_0-logloss:0.64079
[2]	validation_0-logloss:0.64079
[3]	validation_0-logloss:0.62866
[3]	validation_0-logloss:0.62866
[4]	validation_0-logloss:0.61617
[4]	validation_0-logloss:0.61617
[5]	validation_0-logloss:0.60570
[5]	validation_0-logloss:0.60570
[6]	validation_0-logloss:0.59596
[6]	validation_0-logloss:0.59596
[7]	validation_0-logloss:0.58745
[7]	validation_0-logloss:0.58745
[8]	validation_0-logloss:0.57949
[8]	validation_0-logloss:0.57949
[9]	validation_0-logloss:0.57210
[9]	validation_0-logloss:0.57210
[10]	validation_0-logloss:0.56559
[10]	validation_0-logloss:0.56559
[11]	validation_0-logloss:0.55977
[11]	validation_0-logloss:0.55977
[12]	validation_0-logloss:0.55363
[12]	validation_0-logloss:0.55363
[13]	validation_0-logloss:0.54766
[13]	validation_0-logloss:0.54766
[14]	validation_0-logloss

In [6]:
# Evaluate model
from sklearn.metrics import precision_recall_fscore_support

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

print("ðŸ“Š Binary Sentiment Model Performance")
print("=" * 40)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:")
print("         Neg  Pos")
for i, row in enumerate(cm):
    print(f"{id2label[i]:8} {row[0]:4} {row[1]:4}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

ðŸ“Š Binary Sentiment Model Performance
Accuracy:  0.8649
Precision: 0.8652
Recall:    0.8649
F1-score:  0.8648

Confusion Matrix:
         Neg  Pos
negative 16729 3038
positive 2367 17866

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.85      0.86     19767
    positive       0.85      0.88      0.87     20233

    accuracy                           0.86     40000
   macro avg       0.87      0.86      0.86     40000
weighted avg       0.87      0.86      0.86     40000



In [7]:
# Test with examples
def predict_binary(text):
    text_tfidf = vectorizer.transform([text])
    pred_id = model.predict(text_tfidf)[0]
    pred_proba = model.predict_proba(text_tfidf)[0]
    return {
        'label': id2label[pred_id],
        'confidence': float(pred_proba[pred_id]),
        'negative': float(pred_proba[0]),
        'positive': float(pred_proba[1])
    }

test_texts = [
    "I absolutely love this! Best purchase ever!",
    "Terrible product. Complete waste of money.",
    "Great quality and fast shipping!",
    "Disappointed. Would not recommend."
]

print("ðŸ”® Testing Binary Sentiment Model:\n")
for text in test_texts:
    result = predict_binary(text)
    print(f"Text: {text}")
    print(f"Prediction: {result['label'].upper()} ({result['confidence']:.1%})")
    print(f"  Negative: {result['negative']:.1%} | Positive: {result['positive']:.1%}")
    print("-" * 60)

ðŸ”® Testing Binary Sentiment Model:

Text: I absolutely love this! Best purchase ever!
Prediction: POSITIVE (92.7%)
  Negative: 7.3% | Positive: 92.7%
------------------------------------------------------------
Text: Terrible product. Complete waste of money.
Prediction: NEGATIVE (97.7%)
  Negative: 97.7% | Positive: 2.3%
------------------------------------------------------------
Text: Great quality and fast shipping!
Prediction: POSITIVE (91.0%)
  Negative: 9.0% | Positive: 91.0%
------------------------------------------------------------
Text: Disappointed. Would not recommend.
Prediction: NEGATIVE (96.2%)
  Negative: 96.2% | Positive: 3.8%
------------------------------------------------------------


In [8]:
# Save the model
model.save_model('binary_sentiment_model.json')

with open('binary_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('binary_mappings.pkl', 'wb') as f:
    pickle.dump({'label2id': label2id, 'id2label': id2label}, f)

print("âœ… Binary Sentiment Model saved successfully!")
print("   - binary_sentiment_model.json")
print("   - binary_vectorizer.pkl")
print("   - binary_mappings.pkl")
print("\nðŸŽ‰ Model is now ready to use in the app!")

âœ… Binary Sentiment Model saved successfully!
   - binary_sentiment_model.json
   - binary_vectorizer.pkl
   - binary_mappings.pkl

ðŸŽ‰ Model is now ready to use in the app!
