In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('../../../datasets/train.csv')
test_df = pd.read_csv('../../../datasets/test.csv')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=8000)

train_df = train_df.dropna(subset=['cleaned_review', 'Sentiment'])
test_df = test_df.dropna(subset=['cleaned_review', 'Sentiment'])

X_train = tfidf.fit_transform(train_df['cleaned_review'])
X_test = tfidf.transform(test_df['cleaned_review'])

y_train = train_df['Sentiment']
y_test = test_df['Sentiment']

In [71]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

lr_predictions = lr_model.predict(X_test)

In [72]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

accuracy = accuracy_score(y_test, lr_predictions)
classify_report = classification_report(y_test, lr_predictions)
conf_matrix = confusion_matrix(y_test, lr_predictions)

print(f"Accuracy: {accuracy}\n")
print(f"Classification Report:\n {classify_report}\n")
print(f"Confusion Matrix:\n {conf_matrix}")


Accuracy: 0.8233974358974359

Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.83      0.82      1560
    positive       0.83      0.82      0.82      1560

    accuracy                           0.82      3120
   macro avg       0.82      0.82      0.82      3120
weighted avg       0.82      0.82      0.82      3120


Confusion Matrix:
 [[1291  269]
 [ 282 1278]]


In [73]:
from sklearn.naive_bayes import MultinomialNB

mnb_model = MultinomialNB()

mnb_model.fit(X_train, y_train)

In [74]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

mnb_predictions = mnb_model.predict(X_test)

accuracy = accuracy_score(y_test, mnb_predictions)

classify_report = classification_report(y_test, mnb_predictions)

conf_matrix = confusion_matrix(y_test, mnb_predictions)

print(f"Accuracy: {accuracy}\n")
print(f"Classification Report:\n {classify_report}\n")
print(f"Confusion Matrix:\n {conf_matrix}")


Accuracy: 0.8330128205128206

Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.86      0.84      1560
    positive       0.85      0.81      0.83      1560

    accuracy                           0.83      3120
   macro avg       0.83      0.83      0.83      3120
weighted avg       0.83      0.83      0.83      3120


Confusion Matrix:
 [[1338  222]
 [ 299 1261]]


In [14]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

rf_predictions = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, rf_predictions)
classify_report = classification_report(y_test, rf_predictions)
conf_matrix = confusion_matrix(y_test, rf_predictions)

print(f"Accuracy: {accuracy}\n")
print(f"Classification Report:\n {classify_report}\n")
print(f"Confsion Matrix:\n {conf_matrix}")


Accuracy: 0.8006410256410257

Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.83      0.81      1560
    positive       0.82      0.77      0.79      1560

    accuracy                           0.80      3120
   macro avg       0.80      0.80      0.80      3120
weighted avg       0.80      0.80      0.80      3120


Confsion Matrix:
 [[1296  264]
 [ 358 1202]]


In [22]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)  # This creates [1, 0] for ['positive', 'negative']
y_test_enc = le.transform(y_test)

from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train_enc)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [29]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred_xgb = xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test_enc, y_pred_xgb))
print(confusion_matrix(y_test_enc, y_pred_xgb))
print(classification_report(y_test_enc, y_pred_xgb, target_names=le.classes_))

Accuracy: 0.7897435897435897
[[1231  329]
 [ 327 1233]]
              precision    recall  f1-score   support

    negative       0.79      0.79      0.79      1560
    positive       0.79      0.79      0.79      1560

    accuracy                           0.79      3120
   macro avg       0.79      0.79      0.79      3120
weighted avg       0.79      0.79      0.79      3120

