In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import os
import joblib

In [3]:
# Path where the fitted classifier step ("log_reg") is saved with joblib.
MODEL_FILE = "imdb_model.pkl"
# Path where the complete fitted pipeline (vectorizer + classifier) is saved with joblib.
PIPELINE_FILE = "imdb_pipeline.pkl"

def build_pipeline():
    """Assemble the (unfitted) text-classification pipeline.

    Returns:
        Pipeline: a two-step scikit-learn pipeline made of
            * ``"tfidf"``   - ``TfidfVectorizer`` with English stop words
              removed and the vocabulary capped at 5000 features;
            * ``"log_reg"`` - ``LogisticRegression`` with ``max_iter=1000``
              and ``random_state=42``.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    classifier = LogisticRegression(max_iter=1000, random_state=42)

    # Step names are part of the contract: callers look steps up by name.
    steps = [
        ("tfidf", vectorizer),
        ("log_reg", classifier),
    ]
    return Pipeline(steps)


# Train-or-infer driver.
#
# Training runs when either saved artifact is missing; otherwise the saved
# pipeline scores the reviews in INPUT_FILE and writes them to OUTPUT_FILE.
# Both artifacts are checked because the inference branch loads PIPELINE_FILE,
# so the presence of MODEL_FILE alone is not enough to run inference safely.
DATASET_FILE = "IMDB_dataset.csv"  # raw labelled reviews
INPUT_FILE = "input.csv"           # held-out split written by training, read by inference
OUTPUT_FILE = "output.csv"         # inference results

artifacts_ready = os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)

if not artifacts_ready:
    imdb = pd.read_csv(DATASET_FILE)

    # Normalise header names so the lookups below use lower-case, trimmed names.
    imdb.columns = imdb.columns.str.lower().str.strip()

    # Rows without text or without a label cannot be used for training.
    imdb = imdb.dropna(subset=["review", "sentiment"])

    # Convert sentiment to binary (positive=1, negative=0).
    # Any label other than "positive" (case/whitespace-insensitive) becomes 0.
    imdb["sentiment"] = (
        imdb["sentiment"]
        .astype(str)
        .str.strip()
        .str.lower()
        .eq("positive")
        .astype(int)
    )

    # Stratified 80/20 split; the held-out part becomes the inference input.
    train_set, test_set = train_test_split(
        imdb, test_size=0.2, stratify=imdb["sentiment"], random_state=42
    )
    test_set.to_csv(INPUT_FILE, index=False)

    X_train = train_set["review"]
    y_train = train_set["sentiment"]

    # Build pipeline
    pipeline = build_pipeline()

    # Fit model
    pipeline.fit(X_train, y_train)
    model = pipeline.named_steps["log_reg"]

    # Save the full pipeline and, separately, its classifier step
    joblib.dump(pipeline, PIPELINE_FILE)
    joblib.dump(model, MODEL_FILE)

    print(f" Model trained successfully! Files saved: {MODEL_FILE} & {PIPELINE_FILE}")

else:
    # Perform inference using saved model
    print(" Model already exists — running inference...")

    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # artifacts that this notebook itself produced.
    pipeline = joblib.load(PIPELINE_FILE)
    # Take the classifier from the pipeline instead of unpickling MODEL_FILE a
    # second time: predictions come from the pipeline, so this keeps `model`
    # guaranteed to be the estimator that actually produced them.
    model = pipeline.named_steps["log_reg"]

    # Load test data (input)
    input_data = pd.read_csv(INPUT_FILE)

    # Transform & predict (the pipeline vectorizes the raw text itself)
    predictions = pipeline.predict(input_data["review"])

    # Add human-readable predictions to the DataFrame
    label_names = {1: "Positive", 0: "Negative"}
    input_data["predicted_sentiment"] = pd.Series(
        predictions, index=input_data.index
    ).map(label_names)

    # Save output
    input_data.to_csv(OUTPUT_FILE, index=False)
    print(f" Inference complete! Results saved in {OUTPUT_FILE}")

 Model trained successfully! Files saved: imdb_model.pkl & imdb_pipeline.pkl
