# Logistic Regression

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import joblib

In [90]:
# Load the dataset
df = pd.read_csv("Preprocess_Reviews.csv")

In [91]:
# Handle missing values
df.dropna(inplace=True)

In [92]:
df_Positive = df[df['sentiment'] == 'positive'][0:20000]
df_Neutral = df[df['sentiment'] == 'neutral'][0:20000]
df_Negative = df[df['sentiment'] == 'negative'][0:20000]

In [93]:
df = pd.concat([df_Positive, df_Neutral, df_Negative], axis=0)

In [94]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['Reviews'])

In [95]:
# Label Encoding for Sentiment
y = df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

In [96]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
# Logistic Regression Model with Hyperparameter Tuning
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers
    'max_iter': [100, 200, 300]  # Number of iterations
}

In [98]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [99]:
# Best parameters
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 10, 'max_iter': 200, 'solver': 'saga'}


In [100]:
# Best estimator
best_model = grid_search.best_estimator_

In [101]:
# Predictions
y_pred = best_model.predict(X_test)

In [102]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.8395
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85      4023
     neutral       0.80      0.77      0.78      4014
    positive       0.88      0.90      0.89      3963

    accuracy                           0.84     12000
   macro avg       0.84      0.84      0.84     12000
weighted avg       0.84      0.84      0.84     12000



In [104]:
# Save the best model to a file
joblib_file = "logistic_regression_model.joblib"
joblib.dump(best_model, joblib_file)
print(f"Model saved to {joblib_file}")

Model saved to logistic_regression_model.joblib


In [107]:
# Load the model from the file
loaded_model = joblib.load("logistic_regression_model.joblib")
print("Model loaded successfully")


Model loaded successfully


In [108]:
# Predictions with the loaded model
y_pred = loaded_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.8395
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85      4023
     neutral       0.80      0.77      0.78      4014
    positive       0.88      0.90      0.89      3963

    accuracy                           0.84     12000
   macro avg       0.84      0.84      0.84     12000
weighted avg       0.84      0.84      0.84     12000

