In [1]:
import os
import tarfile
import shutil
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Function to read text files in batches
def read_text_files_batch(path, batch_size=1000):
    texts = []
    labels = []
    for root, _, files in os.walk(path):
        for filename in files:
            if filename.endswith('.txt'):
                with open(os.path.join(root, filename), 'r', encoding='utf-8', errors='ignore') as file:
                    texts.append(file.read())
                labels.append(1 if 'pos' in root else 0)
                if len(texts) >= batch_size:
                    yield texts, labels
                    texts, labels = [], []
    if texts:
        yield texts, labels

# Function to extract data from tar file
def extract_data(tar_path, batch_size=100):
    extract_dir = 'extracted_data'
    os.makedirs(extract_dir, exist_ok=True)
    
    try:
        print(f"Attempting to open file at: {tar_path}")
        with tarfile.open(tar_path, 'r:*') as tar:
            print("Tar file opened successfully")
            tar.extractall(path=extract_dir)
        
        base_path = os.path.join(extract_dir, 'aclImdb')
        train_path = os.path.join(base_path, 'train')
        test_path = os.path.join(base_path, 'test')
        
        return train_path, test_path
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

# Main execution
try:
    filename = 'text.tar'
    current_dir = os.getcwd()
    tar_path = os.path.join(current_dir, filename)
    
    train_path, test_path = extract_data(tar_path)

    if train_path is None or test_path is None:
        raise ValueError("Data extraction failed. Please check the file path and permissions.")

    # Create a bag of words representation
    vectorizer = CountVectorizer(max_features=5000)

    # Fit the vectorizer on all training data
    all_train_texts = []
    all_train_labels = []
    for batch_texts, batch_labels in read_text_files_batch(train_path):
        all_train_texts.extend(batch_texts)
        all_train_labels.extend(batch_labels)
    
    X_train = vectorizer.fit_transform(all_train_texts)
    
    # Train the classifier
    clf = MultinomialNB()
    clf.fit(X_train, all_train_labels)
    print("Training completed")

    # Process test data and evaluate
    all_predictions = []
    all_true_labels = []
    for i, (batch_texts, batch_labels) in enumerate(read_text_files_batch(test_path)):
        X_test_bow = vectorizer.transform(batch_texts)
        batch_predictions = clf.predict(X_test_bow)
        all_predictions.extend(batch_predictions) 
        all_true_labels.extend(batch_labels)
        print(f"Processed test batch {i+1}")

    # Evaluate the model
    accuracy = accuracy_score(all_true_labels, all_predictions)
    report = classification_report(all_true_labels, all_predictions)

    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

    # Function to classify a new review
    def classify_review(review):
        review_bow = vectorizer.transform([review])
        prediction = clf.predict(review_bow)
        return "Positive" if prediction[0] == 1 else "Negative"

    # Test the classifier with a sample review
    sample_review = "This movie was fantastic! I really enjoyed the plot and the acting was superb."
    print(f"Sample review classification: {classify_review(sample_review)}")

except Exception as e:
    print(f"An error occurred during execution: {e}")

finally:
    # Clean up: remove the temporary directory
    shutil.rmtree('extracted_data', ignore_errors=True)

Attempting to open file at: C:\Users\aniket\Downloads\text class\text.tar
Tar file opened successfully
Training completed
Processed test batch 1
Processed test batch 2
Processed test batch 3
Processed test batch 4
Processed test batch 5
Processed test batch 6
Processed test batch 7
Processed test batch 8
Processed test batch 9
Processed test batch 10
Processed test batch 11
Processed test batch 12
Processed test batch 13
Processed test batch 14
Processed test batch 15
Processed test batch 16
Processed test batch 17
Processed test batch 18
Processed test batch 19
Processed test batch 20
Processed test batch 21
Processed test batch 22
Processed test batch 23
Processed test batch 24
Processed test batch 25
Processed test batch 26
Accuracy: 0.8162147028237741
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.81     12502
           1       0.81      0.83      0.82     12500

    accuracy                           0.82    