In [1]:
import os
from Pre_Processing_Function import preprocess_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Define the paths to the positive and negative reviews
pos_reviews_path = r'D:\Gethub\Sentiment-Analysis-of-Movie-Review\review_polarity\txt_sentoken\pos'
neg_reviews_path = r'D:\Gethub\Sentiment-Analysis-of-Movie-Review\review_polarity\txt_sentoken\neg'

# Extract Texts from Folders
def read_files_from_directory(reviews_path):
    reviews = []
    for filename in os.listdir(reviews_path):
        if filename.endswith(".txt"):
            with open(os.path.join(reviews_path, filename), 'r') as file:
                reviews.append(file.read())
    return reviews

In [3]:
# Read the positive and negative reviews
original_pos_reviews = read_files_from_directory(pos_reviews_path)
original_neg_reviews = read_files_from_directory(neg_reviews_path)

# Preprocess the reviews
positive_reviews = preprocess_text(original_pos_reviews)
negative_reviews = preprocess_text(original_neg_reviews)

In [4]:
# Concatenate the list of reviews into a single string
all_reviews = [' '.join(review) for review in positive_reviews + negative_reviews]

# Create labels for the reviews
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)

# TF-IDF feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(all_reviews)
y = labels

In [5]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [6]:
# Define the parameter grid to search over for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
    'p': [1, 2]  # Power parameter for the Minkowski metric
}

# Create a K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier()

# Initialize GridSearchCV with the KNN classifier and parameter grid
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_model_accuracy = best_model.score(X_test, y_test)



In [7]:
print("Best Hyperparameters for KNN are:", best_params)
print("Best Model Accuracy:", best_model_accuracy)

Best Hyperparameters for KNN are: {'algorithm': 'auto', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Best Model Accuracy: 0.6766666666666666
