In [3]:
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Artifact and data paths, all relative to the current working directory.
# MODEL_FILE: where train() dumps the fitted "model" pipeline step alone;
# the entry point also uses its existence to decide between train/inference.
MODEL_FILE = "imdb_model.pkl"
# PIPELINE_FILE: where train() dumps the full fitted pipeline; this is the
# file inference() loads.
PIPELINE_FILE = "imdb_pipeline.pkl"
# DATA_FILE: CSV read by train(); it is expected to provide "review" and
# "sentiment" columns (names are lower-cased and stripped before use).
DATA_FILE = "IMDB_dataset.csv"


# Build Pipeline

def build_pipeline():
    """Assemble the untrained text-classification pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with two steps: ``"tfidf"``, a
        ``TfidfVectorizer`` using the built-in English stop-word list and
        ``max_features=10000``, followed by ``"model"``, a
        ``LogisticRegression`` with ``max_iter=1000`` and ``random_state=42``.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
    classifier = LogisticRegression(max_iter=1000, random_state=42)

    # Step names matter: train() looks the classifier up as "model".
    steps = [
        ("tfidf", vectorizer),
        ("model", classifier),
    ]
    return Pipeline(steps)


# Train Model

def train():
    """Train the sentiment pipeline on DATA_FILE and persist the artifacts.

    Reads the dataset, normalizes the column names and the sentiment labels,
    fits the pipeline from ``build_pipeline()`` on a stratified 80% split and
    writes three files to the working directory:

    * ``PIPELINE_FILE`` - the full fitted pipeline,
    * ``MODEL_FILE`` - the fitted ``"model"`` step on its own,
    * ``input.csv`` - the held-out 20% of reviews (single ``review`` column),
      which is the default input of ``inference()``.

    Raises:
        KeyError: If the data has no ``review`` or ``sentiment`` column.
        ValueError: If no row carries a recognised sentiment label.
    """
    df = pd.read_csv(DATA_FILE)
    df.columns = df.columns.str.lower().str.strip()
    df = df.dropna(subset=["review", "sentiment"])

    # Normalize labels the same way as the column names, so values such as
    # "Positive" or " negative" are still recognised.
    labels = df["sentiment"].astype(str).str.strip().str.lower()
    df["sentiment"] = labels.map({"positive": 1, "negative": 0})

    # map() turns unrecognised labels into NaN; those rows must be dropped
    # *after* mapping, otherwise NaN targets reach the split and the fit.
    df = df.dropna(subset=["sentiment"])
    if df.empty:
        raise ValueError(
            f"No rows with a 'positive'/'negative' sentiment found in {DATA_FILE}"
        )
    df["sentiment"] = df["sentiment"].astype(int)

    X = df["review"]
    y = df["sentiment"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    pipeline = build_pipeline()
    pipeline.fit(X_train, y_train)

    joblib.dump(pipeline, PIPELINE_FILE)
    joblib.dump(pipeline.named_steps["model"], MODEL_FILE)

    # Held-out reviews become the default input for inference().
    X_test.to_csv("input.csv", index=False)

    print("✅ Model trained and saved successfully!")



# Inference

def inference(input_csv="input.csv"):
    """Predict sentiment for every review in a CSV and save the result.

    Loads the fitted pipeline from ``PIPELINE_FILE``, predicts on the
    ``review`` column of ``input_csv`` and writes the input rows plus a
    ``predicted_sentiment`` column ("Positive"/"Negative") to ``output.csv``.

    Args:
        input_csv: Path of the CSV to score; it must contain a ``review``
            column.

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist.
    """
    if not os.path.exists(input_csv):
        # Report the path that was actually requested, not the default name.
        raise FileNotFoundError(f"{input_csv} not found!")

    # NOTE: joblib.load unpickles the file; only load artifacts you trust.
    pipeline = joblib.load(PIPELINE_FILE)
    data = pd.read_csv(input_csv)

    predictions = pipeline.predict(data["review"])
    # Build the Series on data's own index so assignment is positional and
    # does not depend on implicit index alignment.
    data["predicted_sentiment"] = pd.Series(predictions, index=data.index).map(
        {1: "Positive", 0: "Negative"}
    )

    data.to_csv("output.csv", index=False)
    print("✅ Inference completed. Output saved to output.csv")


# Entry Point

if __name__ == "__main__":
    # inference() loads PIPELINE_FILE, so checking MODEL_FILE alone is not
    # enough: train whenever either artifact is missing, otherwise predict.
    artifacts_ready = os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)
    if not artifacts_ready:
        train()
    else:
        inference()

✅ Inference completed. Output saved to output.csv
