In [1]:
import pandas as pd

In [1]:
import mlflow
import mlflow.pytorch

In [2]:
import os
import pandas as pd
import torch
from transformers import BartForSequenceClassification, Trainer, TrainingArguments,BartTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
mlflow.set_tracking_uri("http://localhost:5000")

In [4]:
import os

def get_text_from_files(root_folder):
    """
    Recursively fetch all .txt files from the root folder, read their content, and return a list of extracted text.
    
    :param root_folder: Path to the root directory
    :return: List of text content from all .txt files
    """
    text_data = []
    for dirpath, _, filenames in os.walk(root_folder):
        for file in filenames:
            if file.endswith(".txt"):
                file_path = os.path.join(dirpath, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    text_data.append(f.read())
    return text_data

### Load Train Data

In [5]:
neg_data=get_text_from_files(root_folder=r"Movie_data\train\neg")

In [6]:
len(neg_data)

12500

In [7]:
pos_data=get_text_from_files(root_folder=r"Movie_data\train\pos")

In [8]:
len(pos_data)

12500

### Load Test Data

In [18]:
neg_data=get_text_from_files(root_folder=r"Movie_data\test\neg")

In [9]:
def create_sentiment_dataframe(positive_reviews, negative_reviews):

    data = {
        "review": positive_reviews + negative_reviews,
        "sentiment": ["positive"] * len(positive_reviews) + ["negative"] * len(negative_reviews)
    }
    return pd.DataFrame(data)

In [10]:
main_train_data=create_sentiment_dataframe(pos_data,neg_data)

In [11]:
main_train_data.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Homelessness (or Houselessness as George Carli...,positive
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive
3,This is easily the most underrated film inn th...,positive
4,This is not the typical Mel Brooks film. It wa...,positive


### Data Preprocessing for Model Train

In [12]:
def preprocess_data(df):

    df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})
    
    # Ensure equal distribution of positive and negative samples in train and test sets
    df_positive = df[df["label"] == 1]
    df_negative = df[df["label"] == 0]
    
    train_pos, val_pos = train_test_split(df_positive, test_size=0.2, random_state=42)
    train_neg, val_neg = train_test_split(df_negative, test_size=0.2, random_state=42)
    
    train_df = pd.concat([train_pos, train_neg]).sample(frac=1, random_state=42).reset_index(drop=True)
    val_df = pd.concat([val_pos, val_neg]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    
    train_encodings = tokenizer(train_df["review"].tolist(), truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(val_df["review"].tolist(), truncation=True, padding=True, max_length=512)
    
    train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": train_df["label"].tolist()})
    val_dataset = Dataset.from_dict({"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"], "labels": val_df["label"].tolist()})
    
    return DatasetDict({"train": train_dataset, "test": val_dataset})

In [13]:
Processed_dataset=preprocess_data(main_train_data)



In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [15]:
print(Processed_dataset["train"]["labels"])

[0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 

In [51]:
print(Processed_dataset["label"].value_counts())

KeyError: 'label'

### Download the BERT model Locally

In [37]:
from transformers import BartForSequenceClassification, BartTokenizer

# Load and save the model locally before training
model_name = "facebook/bart-large"
model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Save locally before training


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
model.save_pretrained("model/pretrained_bart")
tokenizer.save_pretrained("model/pretrained_bart")

print("Pretrained model downloaded and saved locally.")

Pretrained model downloaded and saved locally.


In [52]:
model=0

In [53]:
tokenizer=0

### Train The Model With ML FLow Tracking with Local RTX GPU

In [16]:
def train_model(dataset):
    """
    Fine-tunes the BART model for sentiment analysis with MLflow tracking.
    """
    mlflow.set_experiment("BART Sentiment Analysis")
    with mlflow.start_run():
        model_path = "model/pretrained_bart"
        model = BartForSequenceClassification.from_pretrained(model_path, num_labels=2)
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            save_strategy="no",  # Disabling frequent saving to reduce GPU load
            per_device_train_batch_size=2,  # Reduced from 8 to 4
            per_device_eval_batch_size=2,   # Reduced batch size
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=50,  # Reduce logging frequency
            fp16=True,  # Enable mixed precision training
            gradient_accumulation_steps=2,  # Helps simulate larger batches
        )
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"]
        )
        
        mlflow.log_params({
            "batch_size": training_args.per_device_train_batch_size,
            "epochs": training_args.num_train_epochs,
            "learning_rate": training_args.learning_rate,
            "weight_decay": training_args.weight_decay
        })
        
        trainer.train()
        
        model.save_pretrained("./fine_tuned_bart")
        tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
        tokenizer.save_pretrained("./fine_tuned_bart")
        
        mlflow.pytorch.log_model(model, "bart_model")
        print("Fine-tuning completed and model saved with MLflow tracking.")


In [17]:
train_model(Processed_dataset)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
  0%|          | 50/15000 [11:41<60:54:27, 14.67s/it]

{'loss': 0.6832, 'learning_rate': 4.984666666666667e-05, 'epoch': 0.01}


  1%|          | 100/15000 [23:27<60:22:03, 14.59s/it]

{'loss': 0.4059, 'learning_rate': 4.9680000000000005e-05, 'epoch': 0.02}


  1%|          | 150/15000 [33:12<45:03:23, 10.92s/it]

{'loss': 0.5769, 'learning_rate': 4.951333333333333e-05, 'epoch': 0.03}


  1%|▏         | 200/15000 [42:29<48:29:50, 11.80s/it]

{'loss': 0.6294, 'learning_rate': 4.9346666666666666e-05, 'epoch': 0.04}


  2%|▏         | 250/15000 [51:40<45:20:36, 11.07s/it]

{'loss': 0.4503, 'learning_rate': 4.918000000000001e-05, 'epoch': 0.05}


  2%|▏         | 300/15000 [1:02:48<47:40:04, 11.67s/it]

{'loss': 0.4196, 'learning_rate': 4.9013333333333334e-05, 'epoch': 0.06}


  2%|▏         | 350/15000 [1:12:29<43:49:14, 10.77s/it]

{'loss': 0.4284, 'learning_rate': 4.884666666666667e-05, 'epoch': 0.07}


  3%|▎         | 400/15000 [1:21:40<46:35:04, 11.49s/it]

{'loss': 0.5947, 'learning_rate': 4.868e-05, 'epoch': 0.08}


  3%|▎         | 450/15000 [1:30:52<43:38:22, 10.80s/it]

{'loss': 0.5233, 'learning_rate': 4.8513333333333335e-05, 'epoch': 0.09}


  3%|▎         | 500/15000 [1:40:03<43:32:20, 10.81s/it]

{'loss': 0.3886, 'learning_rate': 4.834666666666667e-05, 'epoch': 0.1}


  4%|▎         | 550/15000 [1:49:22<44:09:39, 11.00s/it]

{'loss': 0.4816, 'learning_rate': 4.818333333333334e-05, 'epoch': 0.11}


  4%|▍         | 600/15000 [1:58:48<43:35:32, 10.90s/it]

{'loss': 0.6548, 'learning_rate': 4.801666666666667e-05, 'epoch': 0.12}


  4%|▍         | 650/15000 [2:09:04<56:46:26, 14.24s/it]

{'loss': 0.5939, 'learning_rate': 4.785e-05, 'epoch': 0.13}


  5%|▍         | 683/15000 [2:16:47<55:38:43, 13.99s/it]2025/02/09 02:16:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run rebellious-bug-72 at: http://localhost:5000/#/experiments/562578959489541605/runs/2d1441433e2546d780c610868f0eebcb.
2025/02/09 02:16:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/562578959489541605.


KeyboardInterrupt: 

### Model Load in Memory and do the predictions

In [1]:
import torch
from transformers import BartForSequenceClassification, BartTokenizer

# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the fine-tuned model in Google Drive
model_path = r"C:\Project\NLP_Assignment_2\DeepDeployers_A2_MLOPs\Final_trained_model\content\fine_tuned_bart"

# Load the fine-tuned model and tokenizer
model = BartForSequenceClassification.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
model.to(device)  # Move model to GPU if available
model.eval()  # Set model to evaluation mode






  from .autonotebook import tqdm as notebook_tqdm
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): L

In [2]:
def predict_sentiment(text):
    """Predict sentiment (Positive/Negative) for a given text review."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    with torch.no_grad():  # No need to calculate gradients
        outputs = model(**inputs)
    
    prediction = torch.argmax(outputs.logits, dim=1).item()
    sentiment = "Positive" if prediction == 1 else "Negative"
    
    return {"text": text, "sentiment": sentiment}

In [4]:

review_text = """At the outset, Badass Ravi Kumar sets the tone with a disclaimer: “This film is all about badass logic, and here logic is optional.” Promising to deliver the over-the-top, dialogue-heavy entertainment of the 80s, it certainly stays true to its claim. But does that make it entertaining? Unfortunately, the answer is a resounding NO.
The film suffers from a disjointed narrative that struggles to engage despite its grand ambitions. While action films don’t always require logic, they do need coherence—and Badass Ravi Kumar lacks it entirely. Instead, it often feels like an unintentional parody, making its 141-minute runtime feel more torturous than thrilling—especially with Himesh Reshammiya hogging most of the screen time.
Set in 1989, the story follows Ravi Kumar, a rogue cop determined to eradicate corruption and take down the baddies. Suspended for his politically incorrect methods, he is unofficially sent to Muscat to retrieve a camera reel containing sensitive information about Indian secret agents. This intel is being pursued by Syed Bashir (Manish Wadhwa), Carlos Pedro (Prabhu Deva), and Laila (Kirti Kulhari), while Commissioner Awasthi (Saurabh Sachdeva) entrusts Ravi with the mission.
The film revolves around Ravi Kumar’s larger-than-life persona, but while Reshammiya is tolerable in action sequences, he falters in emotional moments. His chest-thumping patriotism and heavy-duty dialogues (courtesy of Bunty Rathore), such as "Jo Ravi Kumar se ulajhta hai, uske photo pe haar chadh jaata hai," fail to leave a lasting impact. They may generate a quick laugh, but they lack the punch needed to make them truly memorable.
Prabhu Deva, as Carlos Pedro Panther, offers some amusement with his flashy red suit and golden shoes. His action scenes, designed to resemble freestyle choreography, are interesting in parts but ultimately too amateurish for him to be considered a formidable villain. Meanwhile, the supporting cast—including Sanjay Mishra, Johnny Lever, Kirti Kulhari, Sunny Leone and Rajesh Sharma—is criminally underutilised.
Adding to the chaos is an awkwardly forced love triangle between Ravi and two sisters, Laila and Madhubala (Simona J). However, this subplot lacks emotional depth, making it neither engaging nor entertaining. Despite her charming looks, Simona J struggles to emote convincingly, making her performance one of the weakest links in the film.
With excessive characters, chaotic subplots, mindless gun battles, and exaggerated stunts, the film quickly turns into a jumbled mess. Even though the VFX and action sequences are visually polished, they fail to deliver the nostalgic masala experience the film aims for. The soundtrack is equally forgettable, with only "Tere Pyaar Mein" managing to leave any impression.
Even if you leave logic at the door, Badass Ravi Kumar fails to provide even guilty-pleasure entertainment. Himesh Reshammiya’s attempt at a mass-action spectacle is not just unwatchable—it’s an exhausting affair."""
result = predict_sentiment(review_text)

# Print result
print(result)

{'text': 'At the outset, Badass Ravi Kumar sets the tone with a disclaimer: “This film is all about badass logic, and here logic is optional.” Promising to deliver the over-the-top, dialogue-heavy entertainment of the 80s, it certainly stays true to its claim. But does that make it entertaining? Unfortunately, the answer is a resounding NO.\nThe film suffers from a disjointed narrative that struggles to engage despite its grand ambitions. While action films don’t always require logic, they do need coherence—and Badass Ravi Kumar lacks it entirely. Instead, it often feels like an unintentional parody, making its 141-minute runtime feel more torturous than thrilling—especially with Himesh Reshammiya hogging most of the screen time.\nSet in 1989, the story follows Ravi Kumar, a rogue cop determined to eradicate corruption and take down the baddies. Suspended for his politically incorrect methods, he is unofficially sent to Muscat to retrieve a camera reel containing sensitive information 

In [1]:
import json
import torch

# Define the best parameters
best_params = {
    "model_details": {
        "model_name": "facebook/bart-large",
        "num_labels": 2,
        "tokenizer_name": "facebook/bart-large",
        "max_sequence_length": 512
    },
    "training_hyperparameters": {
        "learning_rate": 5e-5,
        "batch_size_train": 8,
        "batch_size_eval": 8,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        "gradient_accumulation_steps": 2,
        "warmup_steps": 500,
        "adam_epsilon": 1e-8,
        "logging_steps": 10,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "fp16": True,  # Mixed precision training
        "save_total_limit": 2
    },
    "hardware_details": {
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "num_gpus": torch.cuda.device_count(),
        "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
    },
    "dataset_details": {
        "train_size": 0.8,
        "val_size": 0.2,
        "total_samples": 10000  # Example, update with actual number
    }
}

# Save to JSON file
with open(r"C:\Project\NLP_Assignment_2\DeepDeployers_A2_MLOPs\src\pipeline\best_model_params.json", "w") as json_file:
    json.dump(best_params, json_file, indent=4)

print("Best model parameters saved successfully!")


Best model parameters saved successfully!


In [2]:
with open("best_model_params.json", "w") as json_file:
    json.dump(best_params, json_file, indent=4)

In [3]:
with open(r"C:\Project\NLP_Assignment_2\DeepDeployers_A2_MLOPs\src\pipeline\best_model_params.json", "r") as json_file:
    loaded_params = json.load(json_file)

# Print loaded parameters in a structured way
print(json.dumps(loaded_params, indent=4))

{
    "model_details": {
        "model_name": "facebook/bart-large",
        "num_labels": 2,
        "tokenizer_name": "facebook/bart-large",
        "max_sequence_length": 512
    },
    "training_hyperparameters": {
        "learning_rate": 5e-05,
        "batch_size_train": 8,
        "batch_size_eval": 8,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        "gradient_accumulation_steps": 2,
        "warmup_steps": 500,
        "adam_epsilon": 1e-08,
        "logging_steps": 10,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "fp16": true,
        "save_total_limit": 2
    },
    "hardware_details": {
        "device": "cuda",
        "num_gpus": 1,
        "gpu_name": "NVIDIA GeForce RTX 2060"
    },
    "dataset_details": {
        "train_size": 0.8,
        "val_size": 0.2,
        "total_samples": 10000
    }
}
