In [7]:
import os
from Pre_Processing_Function import preprocess_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
# Define the paths to the positive and negative reviews
pos_reviews_path = r'D:\Gethub\Sentiment-Analysis-of-Movie-Review\review_polarity\txt_sentoken\pos'
neg_reviews_path = r'D:\Gethub\Sentiment-Analysis-of-Movie-Review\review_polarity\txt_sentoken\neg'

# Extract Texts from Folders
def read_files_from_directory(reviews_path):
    reviews = []
    for filename in os.listdir(reviews_path):
        if filename.endswith(".txt"):
            with open(os.path.join(reviews_path, filename), 'r') as file:
                reviews.append(file.read())
    return reviews

In [9]:
# Read the positive and negative reviews
original_pos_reviews = read_files_from_directory(pos_reviews_path)
original_neg_reviews = read_files_from_directory(neg_reviews_path)

# Preprocess the reviews
positive_reviews = preprocess_text(original_pos_reviews)
negative_reviews = preprocess_text(original_neg_reviews)

In [10]:
# Concatenate the list of reviews into a single string
all_reviews = [' '.join(review) for review in positive_reviews + negative_reviews]

# Create labels for the reviews
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)

# TF-IDF feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(all_reviews)
y = labels

In [11]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [12]:
# Define the parameter grid to search over for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400]
}

# Create a Logistic Regression classifier
logistic_regression = LogisticRegression(random_state=42)

# Initialize GridSearchCV with the Logistic Regression classifier and parameter grid
grid_search = GridSearchCV(estimator=logistic_regression, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_model_accuracy = best_model.score(X_test, y_test)



In [13]:
print("Best Hyperparameters for Logistic Regression are:", best_params)
print("Best Model Accuracy:", best_model_accuracy)

Best Hyperparameters for Logistic Regression are: {'C': 100, 'max_iter': 100, 'solver': 'newton-cg'}
Best Model Accuracy: 0.8433333333333334
