In [1]:
# Install required library
!pip install nltk mlflow scikit-learn pandas numpy

# Import all necessary packages
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import re
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


Collecting protobuf<7,>=3.12.0
  Using cached protobuf-6.33.0-cp310-abi3-win_amd64.whl (436 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.8
    Uninstalling protobuf-4.25.8:
      Successfully uninstalled protobuf-4.25.8
Successfully installed protobuf-6.33.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('IMDB.csv')
df = df.sample(500)
df.to_csv('data.csv', index=False)
df.head()

Unnamed: 0,review,sentiment
940,For me personally this film goes down in my to...,positive
437,Oh dear. I was so disappointed that this movie...,negative
442,"quite good, don't expect anything high culture...",positive
693,"Every movie from the thirties is dated, but if...",negative
484,Another attempt by modern Japanese directors t...,positive


In [3]:
# data preprocessing

# Define text preprocessing functions
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

def lower_case(text):
    """Convert text to lower case."""
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    text = re.sub('\s+', ' ', text).strip()
    return text

def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def normalize_text(df):
    """Normalize the text data."""
    try:
        df['review'] = df['review'].apply(lower_case)
        df['review'] = df['review'].apply(remove_stop_words)
        df['review'] = df['review'].apply(removing_numbers)
        df['review'] = df['review'].apply(removing_punctuations)
        df['review'] = df['review'].apply(removing_urls)
        df['review'] = df['review'].apply(lemmatization)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [4]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
940,personally film go top four time exception jam...,positive
437,oh dear disappointed movie rip off japan s rin...,negative
442,quite good expect anything high culture the ac...,positive
693,every movie thirty dated watch john ford movie...,negative
484,another attempt modern japanese director redef...,positive


In [5]:
df['sentiment'].value_counts()

sentiment
negative    251
positive    249
Name: count, dtype: int64

In [6]:
x = df['sentiment'].isin(['positive','negative'])
df = df[x]

In [7]:
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})
df.head()

Unnamed: 0,review,sentiment
940,personally film go top four time exception jam...,1
437,oh dear disappointed movie rip off japan s rin...,0
442,quite good expect anything high culture the ac...,1
693,every movie thirty dated watch john ford movie...,0
484,another attempt modern japanese director redef...,1


In [8]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
import os
import dagshub
import mlflow
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set MLflow tracking URI from environment variable
mlflow_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(mlflow_uri)

# Initialize dagshub with credentials from environment variables
dagshub.init(
    repo_owner=os.getenv("DAGSHUB_REPO_OWNER"),
    repo_name=os.getenv("DAGSHUB_REPO_NAME"),
    mlflow=True
)

# Configuration dictionary with environment variables
config = {
    "data_path": os.getenv("DATA_PATH", "notebooks/data.csv"),
    "test_size": 0.2,
    "mlflow_tracking_uri": os.getenv("MLFLOW_TRACKING_URI"),
    "dagshub_repo_owner": os.getenv("DAGSHUB_REPO_OWNER"),
    "dagshub_repo_name": os.getenv("DAGSHUB_REPO_NAME")
}

# Set up MLflow experiment
mlflow.set_experiment("Logistic Regression Baseline")

# Example of logging parameters and metrics
with mlflow.start_run():
    # Log actual parameters from your configuration
    for key, value in config.items():
        mlflow.log_param(key, value)

🏃 View run bright-auk-771 at: https://dagshub.com/ayusprasad/capstone-project.mlflow/#/experiments/0/runs/0b38ad5d1f244d4a9b53532c8e2d46e5
🧪 View experiment at: https://dagshub.com/ayusprasad/capstone-project.mlflow/#/experiments/0


In [12]:
import mlflow
import logging
import os
import time
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Start MLflow run
with mlflow.start_run():
    start_time = time.time()

    # Log parameters
    mlflow.log_param("vectorizer", "Bag of Words")
    mlflow.log_param("num_features", 100)
    mlflow.log_param("test_size", 0.25)

    # Train model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    mlflow.log_param("model", "Logistic Regression")

    # Predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # ✅ Save model locally (fixes: "Path already exists and is not empty")
    local_model_path = "logged_models/logistic_regression"
    if os.path.exists(local_model_path):
        shutil.rmtree(local_model_path)  # remove existing folder safely
    os.makedirs(local_model_path, exist_ok=True)

    mlflow.sklearn.save_model(model, local_model_path)

    # Optionally, log model as artifact instead of model registry
    mlflow.log_artifact(local_model_path)

    end_time = time.time()
    logging.info(f"Completed in {end_time - start_time:.2f} seconds.")


🏃 View run awesome-trout-667 at: https://dagshub.com/ayusprasad/capstone-project.mlflow/#/experiments/0/runs/7978ed16f8a84611bc873825fab67fb6
🧪 View experiment at: https://dagshub.com/ayusprasad/capstone-project.mlflow/#/experiments/0
