In [51]:
import mlflow
import mlflow.sklearn

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import logging
import os
import time


In [32]:
df = pd.read_csv('IMDB.csv')
df.shape

(1000, 2)

In [33]:
# Let's choose a small sample for exploration and save it into data.csv
df = df.sample(500)
df.to_csv('data.csv', index=False)

In [34]:
df.head()

Unnamed: 0,review,sentiment
28,"Uzak (2002), a Turkish film shown in the U.S. ...",positive
516,"Back in 1994, I had a really lengthy vacation ...",positive
689,"This movie surprised me. Some things were ""cli...",positive
565,This reminds me of when I was a born-again bel...,negative
595,"In a future society, the military component do...",positive


In [35]:
df['sentiment'].value_counts()

sentiment
negative    263
positive    237
Name: count, dtype: int64

In [36]:
# Data preprocessing 

LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

def preprocess_text(text):
    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Removing URLS
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)

    # Step 3: Removing Punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('؛', "")

    # Step 4: Removing Numbers
    text = re.sub(r'\d+', '', text)

    # Step 5 & 6: Tokenize, Remove Stop Words, and Lemmatize
    words = text.split()

    processed_words = [LEMMATIZER.lemmatize(word) for word in words if word not in STOP_WORDS]

    return " ".join(processed_words)

In [37]:
def normalize_text(df, col='review'):
    try:
        df[col] = df[col].apply(preprocess_text)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [38]:
df_normalized = normalize_text(df)

In [39]:
df_normalized.head()

Unnamed: 0,review,sentiment
28,uzak turkish film shown u distantbr br directe...,positive
516,back really lengthy vacation around fourth jul...,positive
689,movie surprised thing clicheish technological ...,positive
565,reminds bornagain believer going minister neve...,negative
595,future society military component recruit rath...,positive


In [41]:
df_normalized['sentiment'].value_counts()

sentiment
negative    263
positive    237
Name: count, dtype: int64

In [42]:
df_normalized.isnull().sum()

review       0
sentiment    0
dtype: int64

In [43]:
df_normalized['sentiment'] =  df_normalized['sentiment'].map({
    'positive': 1,
    'negative': 0
})

df_normalized.head()

Unnamed: 0,review,sentiment
28,uzak turkish film shown u distantbr br directe...,1
516,back really lengthy vacation around fourth jul...,1
689,movie surprised thing clicheish technological ...,1
565,reminds bornagain believer going minister neve...,0
595,future society military component recruit rath...,1


In [44]:
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(df_normalized['review'])
y = df_normalized['sentiment']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [46]:
X_train.shape

(375, 100)

In [47]:
X_test.shape

(125, 100)

In [49]:
import dagshub

In [50]:
mlflow.set_tracking_uri('https://dagshub.com/saxenaabhinav113/MLops-Project.mlflow')
dagshub.init(repo_owner='saxenaabhinav113', repo_name='MLops-Project', mlflow=True)

mlflow.set_experiment("Logistic Regression Baseline")



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=8b7cfb3c-5b6a-48c9-8ca6-d9888de57796&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=06da4bb68d5ee2f3d58a0ee94526c84f3dd478815a5829144fa1696f4ab3c2f9




2025/08/17 15:28:06 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/88bc403d7a0f487188a3ad0cda250abd', creation_time=1755424686477, experiment_id='0', last_update_time=1755424686477, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>