In [2]:
# Import required libraries
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk

# Fetch the NLTK data packages this script relies on: the stopword lists
# (read via nltk.corpus.stopwords) and the 'punkt' tokenizer data
# (presumably what word_tokenize needs -- confirm for your NLTK version).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Root of the extracted IMDB review dataset; the "train" and "test"
# subdirectories are resolved relative to this path further down.
dataset_path = r"C:\Users\Dell\Downloads\aclImdb_v1\aclImdb"
print("Dataset path set to:", dataset_path)

# Function to load data
def load_data(data_dir):
    """Load labelled reviews from ``data_dir`` into a DataFrame.

    Every file whose name ends in ``.txt`` inside the ``pos`` and ``neg``
    subdirectories of ``data_dir`` is read as UTF-8 text. Other files are
    ignored.

    Args:
        data_dir: Directory that contains ``pos`` and ``neg`` subdirectories.

    Returns:
        A ``pandas.DataFrame`` with a ``review`` column (file contents) and a
        ``label`` column (1 for ``pos``, 0 for ``neg``). All ``pos`` rows come
        first; within each label, rows follow sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir``/``open`` when a subdirectory
            or file cannot be read.
    """
    texts, labels = [], []
    for label_type in ['pos', 'neg']:
        dir_name = os.path.join(data_dir, label_type)
        label = 1 if label_type == 'pos' else 0
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order (and anything printed from it) reproducible everywhere.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith(".txt"):
                continue
            with open(os.path.join(dir_name, fname), 'r', encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(label)
    return pd.DataFrame({'review': texts, 'label': labels})

# Resolve the two dataset splits up front, then load each one with
# load_data and report the resulting DataFrame shape.
train_dir = os.path.join(dataset_path, "train")
test_dir = os.path.join(dataset_path, "test")

print("Loading training data...")
train_data = load_data(train_dir)
print("Training data loaded. Shape:", train_data.shape)

print("Loading testing data...")
test_data = load_data(test_dir)
print("Testing data loaded. Shape:", test_data.shape)

# Text preprocessing function
_ENGLISH_STOPWORDS = None  # filled in lazily by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a raw review into a space-joined string of kept tokens.

    Steps, in order: replace every non-word character with a space,
    lowercase, collapse whitespace runs, tokenize with ``word_tokenize``,
    and drop tokens found in NLTK's English stopword list.

    Args:
        text: The raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    # Look the stopwords up once as a set: calling stopwords.words() inside
    # the comprehension rebuilt the list and scanned it linearly per token.
    stop_words = _english_stopwords()
    words = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(words)

# Preprocess training data
print("Preprocessing training data...")
train_data['cleaned_review'] = train_data['review'].apply(preprocess_text)
print("First preprocessed training review:", train_data['cleaned_review'].iloc[0])

# Preprocess testing data
print("Preprocessing testing data...")
test_data['cleaned_review'] = test_data['review'].apply(preprocess_text)
print("First preprocessed testing review:", test_data['cleaned_review'].iloc[0])

# Convert text to TF-IDF features
print("Converting text to TF-IDF features...")
vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrices sparse. Calling .toarray() on a
# 25000 x (up to) 5000 float64 matrix allocates roughly 1 GB per split,
# and LogisticRegression accepts sparse input directly.
# The vocabulary is fitted on the training split only and reused for test.
X_train = vectorizer.fit_transform(train_data['cleaned_review'])
X_test = vectorizer.transform(test_data['cleaned_review'])
print("TF-IDF feature conversion completed.")

y_train = train_data['label']
y_test = test_data['label']

# Train the Logistic Regression model (library default hyperparameters)
print("Training the Logistic Regression model...")
model = LogisticRegression()
model.fit(X_train, y_train)
print("Model training completed.")

# Predict on the held-out test split and report accuracy plus
# per-class precision/recall/F1
print("Making predictions...")
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset path set to: C:\Users\Dell\Downloads\aclImdb_v1\aclImdb
Loading training data...
Training data loaded. Shape: (25000, 2)
Loading testing data...
Testing data loaded. Shape: (25000, 2)
Preprocessing training data...
First preprocessed training review: bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell high satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity
Preprocessing testing data...
First preprocessed testing review: went saw movie last night coaxed friends mine admit reluctant see knew ashton kutcher able comedy wrong kutcher played character jake fischer well kevin costner played ben r