In [1]:
!pip install mlflow boto3 awscli optuna lightgbm imbalanced-learn


Collecting mlflow
  Downloading mlflow-3.5.0-py3-none-any.whl.metadata (30 kB)
Collecting boto3
  Downloading boto3-1.40.54-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli
  Downloading awscli-1.42.54-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.5.0 (from mlflow)
  Downloading mlflow_skinny-3.5.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.0 (from mlflow)
  Downloading mlflow_tracing-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow)
  Downloading fastmcp-2.12.4-py3-none-any.whl.metadata (19 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlfl

In [2]:
import os, getpass

os.environ["AWS_ACCESS_KEY_ID"] = getpass.getpass("Enter AWS Access Key ID: ")
os.environ["AWS_SECRET_ACCESS_KEY"] = getpass.getpass("Enter AWS Secret Access Key: ")
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

Enter AWS Access Key ID: ··········
Enter AWS Secret Access Key: ··········


In [3]:
import boto3
s3 = boto3.client('s3')
print(s3.list_buckets())

{'ResponseMetadata': {'RequestId': '7XBHN1SGT21DK0TQ', 'HostId': 'z/w8/NdVygL5vRj6VSYp/Zhvjke/3bRPlXmu5M7pGTipF5Xe6Ltcx7WAO5G82PmokpfdZhniSfc=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'z/w8/NdVygL5vRj6VSYp/Zhvjke/3bRPlXmu5M7pGTipF5Xe6Ltcx7WAO5G82PmokpfdZhniSfc=', 'x-amz-request-id': '7XBHN1SGT21DK0TQ', 'date': 'Fri, 17 Oct 2025 02:56:12 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Buckets': [{'Name': 'project1-mlflow-bucket', 'CreationDate': datetime.datetime(2025, 10, 12, 12, 4, 13, tzinfo=tzlocal())}], 'Owner': {'DisplayName': 'adityasahusomu02', 'ID': '866466d6c4d8a9893e39cdce3c468d75133f23de2c907deeda4f1a78a7ca565b'}}


In [4]:
import mlflow
mlflow.set_tracking_uri("http://ec2-3-15-32-230.us-east-2.compute.amazonaws.com:5000/")

print("Tracking URI:", mlflow.get_tracking_uri())

Tracking URI: http://ec2-3-15-32-230.us-east-2.compute.amazonaws.com:5000/


In [5]:
mlflow.set_experiment("exp 8 -Naive_Bayes with HP Tuning")

2025/10/17 03:34:22 INFO mlflow.tracking.fluent: Experiment with name 'exp 8 -Naive_Bayes with HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://project1-mlflow-bucket/355066078437915299', creation_time=1760672062926, experiment_id='355066078437915299', last_update_time=1760672062926, lifecycle_stage='active', name='exp 8 -Naive_Bayes with HP Tuning', tags={}>

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna

In [7]:
url = 'https://raw.githubusercontent.com/adityasahusomu/Youtube_Comment_Analyzer/refs/heads/main/cleaned_reddit_dataset.csv'
df = pd.read_csv(url).dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [8]:
df = df.dropna(subset=['category'])

ngram_range = (1, 3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():

        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        mlflow.log_param("algo_name", model_name)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        mlflow.sklearn.log_model(model, f"{model_name}_model")


def objective_mnb(trial):
    alpha = trial.suggest_float('alpha', 1e-4, 1.0, log=True)

    model = MultinomialNB(alpha=alpha)
    return accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test))


def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_mnb, n_trials=30)

    best_params = study.best_params
    best_model = MultinomialNB(alpha=best_params['alpha'])

    log_mlflow("MultinomialNB", best_model, X_train, X_test, y_train, y_test)

# Multinomial Naive Bayes
run_optuna_experiment()


[I 2025-10-17 03:37:48,229] A new study created in memory with name: no-name-45f3dce2-88c1-4ba2-8953-ac0efe0fdabc
[I 2025-10-17 03:37:48,239] Trial 0 finished with value: 0.6679348974846755 and parameters: {'alpha': 0.05356822865823907}. Best is trial 0 with value: 0.6679348974846755.
[I 2025-10-17 03:37:48,248] Trial 1 finished with value: 0.6682519551891778 and parameters: {'alpha': 0.027312314075643912}. Best is trial 1 with value: 0.6682519551891778.
[I 2025-10-17 03:37:48,257] Trial 2 finished with value: 0.6645529486366518 and parameters: {'alpha': 0.9993953908062037}. Best is trial 1 with value: 0.6682519551891778.
[I 2025-10-17 03:37:48,265] Trial 3 finished with value: 0.6664552948636652 and parameters: {'alpha': 0.2410013886979918}. Best is trial 1 with value: 0.6682519551891778.
[I 2025-10-17 03:37:48,273] Trial 4 finished with value: 0.6682519551891778 and parameters: {'alpha': 0.0006452435253884178}. Best is trial 1 with value: 0.6682519551891778.
[I 2025-10-17 03:37:48,28

🏃 View run MultinomialNB_SMOTE_TFIDF_Trigrams at: http://ec2-3-15-32-230.us-east-2.compute.amazonaws.com:5000/#/experiments/355066078437915299/runs/42b5aa4954c74cc5af3ca531f702b5ea
🧪 View experiment at: http://ec2-3-15-32-230.us-east-2.compute.amazonaws.com:5000/#/experiments/355066078437915299
