In [14]:
# imports

import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [15]:
# for experimentation purpose, we will sample 500 rows from the original dataset
df = pd.read_csv('IMDB.csv')
df = df.sample(500)
df.to_csv('data.csv', index=False)
df.head()

Unnamed: 0,review,sentiment
3345,Below average movie with poor music considerin...,negative
28743,This is a movie which attempts a retelling of ...,negative
21125,I haven't seen this in over 20yrs but I still ...,positive
20186,"Wow, where do I begin? After suffering through...",negative
45506,The great and underrated Marion Davies shows h...,positive


In [16]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STARNET\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STARNET\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def cleaning_pipeline(text):    
    """Complete text cleaning pipeline."""
    if not isinstance(text, str):
        return ""
    
    # 1. Lowercase & URLs
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # 2. Numbers & Punctuation
    text = re.sub(r'\d+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('ÿõ', "")
    
    # 3. Tokenize, Stopwords, and Lemmatization
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    
    return " ".join(words).strip()

def normalize_text(df):
    """Normalize text data in the DataFrame."""
    try:
        df['review'] = df['review'].apply(cleaning_pipeline)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [18]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
3345,average movie poor music considering movie bas...,negative
28743,movie attempt retelling thai history set ancie...,negative
21125,seen yr still remember thing br br film could ...,positive
20186,wow begin suffering wretched rental thanks lot...,negative
45506,great underrated marion davy show stuff late s...,positive


In [19]:
df['sentiment'].value_counts()

sentiment
negative    252
positive    248
Name: count, dtype: int64

In [20]:
x = df['sentiment'].isin(['positive','negative'])
df = df[x]

In [21]:
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})
df.head()

Unnamed: 0,review,sentiment
3345,average movie poor music considering movie bas...,0
28743,movie attempt retelling thai history set ancie...,0
21125,seen yr still remember thing br br film could ...,1
20186,wow begin suffering wretched rental thanks lot...,0
45506,great underrated marion davy show stuff late s...,1


In [22]:
# check for null values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [23]:
# CountVectorizer: converting text data into a matrix of token counts.
vectorizer = CountVectorizer(max_features=100)

# create feature matrix and target vector
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [25]:
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/aashu-0/MLOps_Learning_Project.mlflow')
dagshub.init(repo_owner='aashu-0', repo_name='MLOps_Learning_Project', mlflow=True)

mlflow.set_experiment("Logistic Regression Baseline")

2025-12-26 03:50:14,704 - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/aashu-0/MLOps_Learning_Project "HTTP/1.1 200 OK"


2025-12-26 03:50:14,746 - INFO - Initialized MLflow to track repo "aashu-0/MLOps_Learning_Project"


2025-12-26 03:50:14,755 - INFO - Repository aashu-0/MLOps_Learning_Project initialized!


<Experiment: artifact_location='mlflow-artifacts:/61fc206f6cfb47c2bbed0ab34ebba037', creation_time=1766700815810, experiment_id='0', last_update_time=1766700815810, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [28]:
import mlflow
import logging
import os
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

logging.info("Starting MLflow run...")

with mlflow.start_run():
    start_time = time.time()
    
    try:
        logging.info("Logging preprocessing parameters...")
        mlflow.log_param("vectorizer", "Bag of Words")
        mlflow.log_param("num_features", 100)
        mlflow.log_param("test_size", 0.25)

        logging.info("Initializing Logistic Regression model...")
        model = LogisticRegression(max_iter=1000)  # Increase max_iter to prevent non-convergence issues

        logging.info("Fitting the model...")
        model.fit(X_train, y_train)
        logging.info("Model training complete.")

        logging.info("Logging model parameters...")
        mlflow.log_param("model", "Logistic Regression")

        logging.info("Making predictions...")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info("Logging evaluation metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model, name = "model")

        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")

        # Print the results for verification
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)


2025-12-26 03:57:36,439 - INFO - Starting MLflow run...
2025-12-26 03:57:37,817 - INFO - Logging preprocessing parameters...
2025-12-26 03:57:38,984 - INFO - Initializing Logistic Regression model...
2025-12-26 03:57:38,988 - INFO - Fitting the model...
2025-12-26 03:57:39,047 - INFO - Model training complete.
2025-12-26 03:57:39,051 - INFO - Logging model parameters...
2025-12-26 03:57:39,444 - INFO - Making predictions...
2025-12-26 03:57:39,446 - INFO - Calculating evaluation metrics...
2025-12-26 03:57:39,508 - INFO - Logging evaluation metrics...
2025-12-26 03:57:41,077 - INFO - Saving and logging the model...
2025-12-26 03:58:02,340 - INFO - Model training and logging completed in 24.52 seconds.
2025-12-26 03:58:02,343 - INFO - Accuracy: 0.656
2025-12-26 03:58:02,346 - INFO - Precision: 0.6610169491525424
2025-12-26 03:58:02,351 - INFO - Recall: 0.6290322580645161
2025-12-26 03:58:02,357 - INFO - F1 Score: 0.6446280991735537


üèÉ View run capricious-fly-742 at: https://dagshub.com/aashu-0/MLOps_Learning_Project.mlflow/#/experiments/0/runs/1dfe31baad2343ba8c601401e034ca00
üß™ View experiment at: https://dagshub.com/aashu-0/MLOps_Learning_Project.mlflow/#/experiments/0
