In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the precomputed feature matrix and the matching labels from disk.
X = pd.read_csv("../data/X_features.csv")

# Squeeze only along the column axis: a single-column label file becomes a
# Series even when it has exactly one row (a bare squeeze() would collapse
# that case to a scalar and break the split/fit cells below).
y = pd.read_csv("../data/y_labels.csv").squeeze(axis="columns")

# Fail fast if the two files are out of sync instead of erroring later in
# train_test_split with a less obvious message.
assert len(X) == len(y), "X_features.csv and y_labels.csv have different row counts"

X.shape, y.shape


((500, 106), (500,))

In [3]:
# Hold out 20% of the rows for evaluation with a fixed seed so the split is
# reproducible. Stratifying on y keeps the label proportions the same in the
# train and test parts, which matters when the sentiment classes are uneven.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Logistic-regression classifier for the sentiment labels.
#   max_iter=1000           - raised iteration cap for the solver
#   class_weight="balanced" - original author's note: this "fixes hate→positive
#                             issue" (presumably negative text being predicted
#                             as Positive before reweighting; confirm)
#   n_jobs=-1               - request all available cores
model = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1)

# Keep fit() as the final expression so the notebook displays the estimator.
model.fit(X_train, y_train)


In [5]:
# Score the held-out split: compute the metrics first, then print them.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# NOTE(review): the saved output below shows a perfect 1.0 on 100 test rows.
# Worth confirming that X_features does not leak the label before trusting it.
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(test_report)


Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        39
     Neutral       1.00      1.00      1.00        21
    Positive       1.00      1.00      1.00        40

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [6]:
# Confusion matrix for the held-out split (rows: true label, columns: predicted).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)


[[39  0  0]
 [ 0 21  0]
 [ 0  0 40]]


In [7]:
# Persist the fitted classifier so other notebooks/scripts can reload it.
# The context manager closes the file even if pickling raises.
with open("../models/logistic_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Logistic Regression model saved")


✅ Logistic Regression model saved


In [8]:
# Smoke test: reload the saved artifacts from disk and classify two clearly
# negative sentences end to end (raw text -> vectorizer -> classifier).
test = ["I hate this product", "worst experience ever"]

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load .pkl files produced by this project, never untrusted ones.
# Opening via `with` closes each file handle promptly; the previous
# pickle.load(open(...)) form left both handles open.
# The vectorizer is presumably the one that produced X_features.csv (confirm).
with open("../models/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

with open("../models/logistic_model.pkl", "rb") as saved_model_file:
    model = pickle.load(saved_model_file)

model.predict(vectorizer.transform(test))




array(['Negative', 'Negative'], dtype=object)