In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

def load_data(file_path):
    """Read the review dataset from a CSV file.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must provide ``review`` and
            ``sentiment`` columns.

    Returns:
        A ``(reviews, sentiments)`` pair containing the ``review`` column and
        the ``sentiment`` column of the loaded frame, in that order.
    """
    frame = pd.read_csv(file_path)
    reviews = frame['review']
    sentiments = frame['sentiment']
    return reviews, sentiments

def preprocess_labels(y):
    """Convert raw class labels into encoded targets.

    Args:
        y: The raw labels to encode.

    Returns:
        A ``(encoder, encoded_labels)`` pair: the ``LabelEncoder`` fitted on
        ``y`` (kept so predictions can be mapped back with
        ``inverse_transform``) and the result of ``fit_transform(y)``.
    """
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(y)
    return label_encoder, encoded_labels

def prepare_data(X, Y, test_size=0.2, random_state=42):
    """Split features and labels into training and test partitions.

    Args:
        X: Feature data to split.
        Y: Labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``
            so the split is reproducible.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs; the caller
        unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    partitions = train_test_split(X, Y, **split_options)
    return partitions

def vectorize_text(train_text, test_text, max_features=10000):
    """Turn raw text into TF-IDF feature matrices.

    The vectorizer is fitted on the training text only and then applied to
    the test text, so the test set does not influence the fitted vectorizer.

    Args:
        train_text: Documents used to fit the vectorizer.
        test_text: Documents transformed with the already-fitted vectorizer.
        max_features: Forwarded to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        A ``(train_matrix, test_matrix, vectorizer)`` triple.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    train_matrix = tfidf.fit_transform(train_text)
    test_matrix = tfidf.transform(test_text)
    return train_matrix, test_matrix, tfidf

def train_model(X_train, y_train):
    """Fit a logistic-regression classifier with default settings.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # scikit-learn estimators return themselves from ``fit``.
    return LogisticRegression().fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Predict on the test set, report accuracy, and hand back the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix passed to ``model.predict``.
        y_test: Ground-truth labels compared against the predictions.

    Returns:
        The predictions produced by ``model.predict(X_test)``.

    Side effects:
        Prints the accuracy as a percentage with two decimal places.
    """
    predicted = model.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return predicted

def display_predictions(predictions, true_values, label_encoder, num_samples=5):
    """Print decoded predicted and true labels for the leading test samples.

    Each encoded value is mapped back to its original label through
    ``label_encoder.inverse_transform`` before being printed, one per line.

    Args:
        predictions: Sliceable sequence of encoded predicted labels.
        true_values: Sliceable sequence of encoded ground-truth labels.
        label_encoder: Object exposing ``inverse_transform`` that accepts a
            one-element list and returns an indexable result.
        num_samples: Number of leading samples shown from each sequence.
            The section headers report this value, so they stay correct
            when a non-default count is requested.
    """
    sections = (
        ("Predictions", predictions),
        ("True values", true_values),
    )
    for title, encoded_values in sections:
        # The header interpolates num_samples instead of a hard-coded "5".
        print(f"\n{title} for the first {num_samples} test samples:")
        for encoded in encoded_values[:num_samples]:
            print(label_encoder.inverse_transform([encoded])[0])

# Script entry point: runs the full pipeline (load -> encode -> split ->
# vectorize -> train -> evaluate -> show samples) on the IMDB review CSV.
if __name__ == "__main__":
    # NOTE(review): looks like a Google Drive path mounted inside Colab;
    # adjust when running elsewhere.
    file_path = "/content/drive/MyDrive/IMDB Dataset.csv"

    # X holds the 'review' column, Y the 'sentiment' column.
    X, Y = load_data(file_path)
    # Y is rebound to its encoded form; label_encoder is kept to decode later.
    label_encoder, Y = preprocess_labels(Y)
    # Uses prepare_data's defaults (test_size=0.2, random_state=42).
    X_train, X_test, y_train, y_test = prepare_data(X, Y)
    # The vectorizer is fitted on X_train only, then applied to X_test.
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)

    model = train_model(X_train_vec, y_train)

    # Prints the test accuracy and returns the encoded predictions.
    predictions = evaluate_model(model, X_test_vec, y_test)

    # Shows decoded predictions next to the decoded true labels.
    display_predictions(predictions, y_test, label_encoder)

Test Accuracy: 89.92%

Predictions for the first 5 test samples:
negative
positive
negative
positive
negative

True values for the first 5 test samples:
positive
positive
negative
positive
negative
