In [2]:
import os
import sys
sys.path.append(r'D:\Gethub\Sentiment-Analysis-of-Movie-Reviews')
from Pre_Processing import preprocess_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Extract Texts from Folders
def read_reviews(reviews_path):
    """Read every ``.txt`` file directly inside ``reviews_path``.

    Parameters
    ----------
    reviews_path : str
        Directory that holds one review per ``.txt`` file. Entries whose name
        does not end in ``.txt`` are ignored.

    Returns
    -------
    list of str
        The full text of each review, ordered by file name.

    Raises
    ------
    OSError
        If ``reviews_path`` cannot be listed or a file cannot be opened.
    """
    reviews = []
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the review order (and any seeded split built on top of it)
    # reproducible across machines and runs.
    for filename in sorted(os.listdir(reviews_path)):
        if filename.endswith(".txt"):
            # ISO-8859-1 maps every byte to a character, so decoding never fails.
            with open(os.path.join(reviews_path, filename), 'r', encoding='ISO-8859-1') as file:
                reviews.append(file.read())
    return reviews

In [3]:
# Define the paths to the positive and negative reviews.
# The project root defaults to the original local checkout, but it can be
# overridden with the SENTIMENT_PROJECT_ROOT environment variable so the
# notebook runs on another machine without editing this cell.
project_root = os.environ.get(
    'SENTIMENT_PROJECT_ROOT',
    r'D:\Gethub\Sentiment-Analysis-of-Movie-Reviews',
)
pos_reviews_path = os.path.join(project_root, 'Augmented_Dataset', 'pos')
neg_reviews_path = os.path.join(project_root, 'Augmented_Dataset', 'neg')

# Read the positive and negative reviews
original_pos_reviews = read_reviews(pos_reviews_path)
original_neg_reviews = read_reviews(neg_reviews_path)

# Fail early with a clear message: an empty class would otherwise only show up
# later as a confusing error (or a one-class dataset) during vectorizing/training.
if not original_pos_reviews:
    raise ValueError(f"No .txt reviews found in {pos_reviews_path!r}")
if not original_neg_reviews:
    raise ValueError(f"No .txt reviews found in {neg_reviews_path!r}")

# Preprocess the reviews with the project's preprocessing helper
positive_reviews = preprocess_text(original_pos_reviews)
negative_reviews = preprocess_text(original_neg_reviews)

In [4]:
# Concatenate the list of reviews into a single string
all_reviews = [' '.join(review) for review in positive_reviews + negative_reviews]


# Create labels for the reviews
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)


# TF-IDF feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(all_reviews)
y = labels

In [5]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)


In [6]:
# Hyperparameter search space for the decision tree
param_grid = dict(
    max_depth=[None, 10, 20, 30],    # depth limit of the tree (None = no limit)
    min_samples_split=[2, 5, 10],    # samples needed before an internal node may split
    min_samples_leaf=[1, 2, 4],      # samples that must remain in every leaf
    criterion=['gini', 'entropy'],   # split-quality measure
)

# Base estimator; the fixed seed keeps tree construction repeatable
dt_classifier = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning model and the parameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Accuracy of the winning model on the held-out test split
best_model_accuracy = best_model.score(X_test, y_test)


In [7]:
# Report the selected hyperparameters and the held-out accuracy, one per line
for caption, value in (
    ("Best Hyperparameters for Decision Tree:", best_params),
    ("Best Model Accuracy:", best_model_accuracy),
):
    print(caption, value)

Best Hyperparameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Model Accuracy: 0.6283333333333333
